################################# Stability Definition #########################################
pdz3_final_df_ddg <- read_csv('../data/cleaned_ddg/pdz3_ddg_cleaned_mochi_refit.csv')
## Rows: 3154 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id_eve, id_old, trait_name, library, assay, pdz_name, alignment_po...
## dbl (26): X, V1, pos_am, ddg, std_ddg, ci95_kcal.mol, pdz, structural_alignm...
## lgl  (1): binding_interface_5A
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(pdz3_final_df_ddg)
## [1] 3154
pdz3_final_df_ddgf <- pdz3_final_df_ddg %>% filter(trait_name == "Folding") %>% 
  dplyr::rename(f_ddg_pred = ddg,
                f_ddg_pred_sd = std_ddg)
nrow(pdz3_final_df_ddgf)
## [1] 1587
pdz3_final_df_ddgb <- pdz3_final_df_ddg %>% filter(trait_name == "Binding") %>% 
  dplyr::rename(b_ddg_pred = ddg,
                b_ddg_pred_sd = std_ddg)
nrow(pdz3_final_df_ddgb)
## [1] 1567
pdz3_final_df_ddg_2plot <- merge(pdz3_final_df_ddgf, pdz3_final_df_ddgb, by = "id_eve", all.x = TRUE)
dim(pdz3_final_df_ddg_2plot)
## [1] 1587   81
# Step 1: Calculate z-scores and p-values for stabilizing mutations
pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>%
  mutate(z_stab = (f_ddg_pred - 0) / f_ddg_pred_sd)

pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>%
  mutate(p_stab = pnorm(z_stab))

# Step 2: Calculate z-scores and p-values for destabilizing mutations
pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>%
  mutate(z_destab = (f_ddg_pred - 0.5) / f_ddg_pred_sd,
         p_destab = 1 - pnorm(z_destab))

# Step 3: Classify mutations based on p-values
pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>%
  mutate(class = case_when(
    p_stab < 0.05 ~ "stabilizing",
    p_destab < 0.05 ~ "destabilizing",
    TRUE ~ "no change"
  ))

# Print summary of classification counts
class_summary <- table(pdz3_final_df_ddg_2plot$class)
print(class_summary)
## 
## destabilizing     no change   stabilizing 
##           514          1042            31
#destabilizing     no change   stabilizing 
#514          1042            31 

paper_anno <- read.csv("../data/cleaned_ddg/pdz3_ddg_cleaned.csv")
head(paper_anno)
##   id_eve id_am pos_am id_old Pos mut_order   f_dg_pred f_ddg_pred f_ddg_pred_sd
## 1  A386C A343C    343   A33C  33         1 -0.39296590  0.9164153    0.08446857
## 2  A386D A343D    343   A33D  33         1 -1.07504355  0.2343376    0.07533930
## 3  A386E A343E    343   A33E  33         1 -1.49154622 -0.1821651    0.28006551
## 4  A386F A343F    343   A33F  33         1 -0.06578612  1.2435951    0.05578810
## 5  A386G A343G    343   A33G  33         1 -0.50514604  0.8042351    0.04267052
## 6  A386I A343I    343   A33I  33         1 -0.63955357  0.6698276    0.03671739
##    b_dg_pred   b_ddg_pred b_ddg_pred_sd f_ddg_pred_conf b_ddg_pred_conf
## 1 -0.7049648  0.001690949    0.04058391            TRUE            TRUE
## 2 -0.6161306  0.090525109    0.03136712            TRUE            TRUE
## 3 -0.7094793 -0.002823546    0.03873099           FALSE            TRUE
## 4 -0.6698994  0.036756306    0.04147284            TRUE            TRUE
## 5 -0.8211448 -0.114489064    0.02278570            TRUE            TRUE
## 6 -0.5572302  0.149425538    0.12483570            TRUE            TRUE
##   HAmin_ligand scHAmin_ligand RSASA   SS Pos_class    protein WT_AA Mut
## 1     10.10073       12.45975  0.87 <NA>   surface PSD95-PDZ3     A   C
## 2     10.10073       12.45975  0.87 <NA>   surface PSD95-PDZ3     A   D
## 3     10.10073       12.45975  0.87 <NA>   surface PSD95-PDZ3     A   E
## 4     10.10073       12.45975  0.87 <NA>   surface PSD95-PDZ3     A   F
## 5     10.10073       12.45975  0.87 <NA>   surface PSD95-PDZ3     A   G
## 6     10.10073       12.45975  0.87 <NA>   surface PSD95-PDZ3     A   I
##   b_ddg_wposmeanabs b_ddg_wposse allosteric orthosteric allosteric_mutation
## 1         0.1079272  0.009094901         NA          NA               FALSE
## 2         0.1079272  0.009094901         NA          NA               FALSE
## 3         0.1079272  0.009094901         NA          NA               FALSE
## 4         0.1079272  0.009094901         NA          NA               FALSE
## 5         0.1079272  0.009094901         NA          NA               FALSE
## 6         0.1079272  0.009094901         NA          NA               FALSE
##   wt_aa.x mt_aa pos_eve wt_aa.y      atom consurf color confidence_interval
## 1       A     C     386       A ALA:343:A  -0.451     7 -0.588, -0.348  8,7
## 2       A     D     386       A ALA:343:A  -0.451     7 -0.588, -0.348  8,7
## 3       A     E     386       A ALA:343:A  -0.451     7 -0.588, -0.348  8,7
## 4       A     F     386       A ALA:343:A  -0.451     7 -0.588, -0.348  8,7
## 5       A     G     386       A ALA:343:A  -0.451     7 -0.588, -0.348  8,7
## 6       A     I     386       A ALA:343:A  -0.451     7 -0.588, -0.348  8,7
##   msa_data                   RESIDUE.VARIETY     AM AM_logit   ESM1b popEVE
## 1  150/150 A 94%, S  2%, T  2%, P <1%, G <1% 0.9283 2.560864 -15.589 -4.577
## 2  150/150 A 94%, S  2%, T  2%, P <1%, G <1% 0.9881 4.419246 -16.408 -4.690
## 3  150/150 A 94%, S  2%, T  2%, P <1%, G <1% 0.9759 3.701148 -13.101 -4.522
## 4  150/150 A 94%, S  2%, T  2%, P <1%, G <1% 0.9752 3.671799 -16.444 -5.093
## 5  150/150 A 94%, S  2%, T  2%, P <1%, G <1% 0.7414 1.053258 -11.524 -4.008
## 6  150/150 A 94%, S  2%, T  2%, P <1%, G <1% 0.9837 4.100156 -15.306 -4.718
##     ESM1v   EVE
## 1 -10.991 6.604
## 2 -11.585 6.652
## 3 -10.914 6.104
## 4 -13.794 6.533
## 5  -7.404 6.073
## 6 -11.800 6.545
table(paper_anno$orthosteric)
## 
## TRUE 
##  113
ortho_list <- unique(paper_anno %>% filter(orthosteric == TRUE) %>% pull(pos_am))
##setdiff(unique(pdz3_out %>% filter(scHAmin_ligand < 4.3) %>% pull(Pos)), unique(merged_df_pdz3 %>% filter(orthosteric == TRUE) %>% pull (Pos_ref.x)))
ortho_list <- append(ortho_list, c(322, 323, 324 ,325, 326, 327, 328, 331, 339, 372, 376, 379, 380))

head(pdz3_final_df_ddg_2plot)
##   id_eve X.x V1.x id_old.x pos_am.x trait_name.x f_ddg_pred f_ddg_pred_sd
## 1  A386C 595  596     A33C       33      Folding  0.6659222     0.1989811
## 2  A386D 596  597     A33D       33      Folding  0.1566707     0.1977913
## 3  A386E 597  598     A33E       33      Folding -0.2662460     0.4344191
## 4  A386F 598  599     A33F       33      Folding  1.0041094     0.2397002
## 5  A386G 599  600     A33G       33      Folding  0.5761733     0.2151606
## 6  A386I 600  601     A33I       33      Folding  0.4137276     0.1953078
##   ci95_kcal.mol.x     library.x assay.x pdz.x pdz_name.x alignment_position.x
## 1       0.7800060 761_abundance folding   761  DLG4_PDZ3                  V53
## 2       0.7753419 761_abundance folding   761  DLG4_PDZ3                  V53
## 3       1.7029230 761_abundance folding   761  DLG4_PDZ3                  V53
## 4       0.9396246 761_abundance folding   761  DLG4_PDZ3                  V53
## 5       0.8434296 761_abundance folding   761  DLG4_PDZ3                  V53
## 6       0.7656068 761_abundance folding   761  DLG4_PDZ3                  V53
##   WT_aa.x structural_alignment_pos.x    consv.x HAmin_ligand.x scHAmin_ligand.x
## 1       A                         53 0.02246867       10.57668          12.9716
## 2       A                         53 0.02246867       10.57668          12.9716
## 3       A                         53 0.02246867       10.57668          12.9716
## 4       A                         53 0.02246867       10.57668          12.9716
## 5       A                         53 0.02246867       10.57668          12.9716
## 6       A                         53 0.02246867       10.57668          12.9716
##      X1.x     X2.x     X3.x    X4.x    X5.x    X6.x     X7.x     X8.x    X9.x
## 1 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 2 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 3 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 4 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 5 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 6 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
##   Pos_ref.x closest_ligand_pos.x closest_ligand_aa.x binding_interface_5A.x
## 1       343                    9                   V                  FALSE
## 2       343                    9                   V                  FALSE
## 3       343                    9                   V                  FALSE
## 4       343                    9                   V                  FALSE
## 5       343                    9                   V                  FALSE
## 6       343                    9                   V                  FALSE
##   WT_aa.1.x secondary_structure.x   rsasa.x structure_location.x wt_aa.x
## 1         A                  loop 0.8103598              surface       A
## 2         A                  loop 0.8103598              surface       A
## 3         A                  loop 0.8103598              surface       A
## 4         A                  loop 0.8103598              surface       A
## 5         A                  loop 0.8103598              surface       A
## 6         A                  loop 0.8103598              surface       A
##   mt_aa.x pos_eve.x popEVE.x ESM1v.x  X.y V1.y id_old.y pos_am.y trait_name.y
## 1       C       386   -4.577 -10.991 2158 2160     A33C       33      Binding
## 2       D       386   -4.690 -11.585 2159 2161     A33D       33      Binding
## 3       E       386   -4.522 -10.914 2160 2162     A33E       33      Binding
## 4       F       386   -5.093 -13.794 2161 2163     A33F       33      Binding
## 5       G       386   -4.008  -7.404 2162 2164     A33G       33      Binding
## 6       I       386   -4.718 -11.800 2163 2165     A33I       33      Binding
##    b_ddg_pred b_ddg_pred_sd ci95_kcal.mol.y library.y assay.y pdz.y pdz_name.y
## 1  0.11921857    0.06126311       0.2401514   761_808 binding   761  DLG4_PDZ3
## 2 -0.02530033    0.82292100       3.2258503   761_808 binding   761  DLG4_PDZ3
## 3  0.04624840    0.06722154       0.2635084   761_808 binding   761  DLG4_PDZ3
## 4  0.27775264    0.16555277       0.6489668   761_808 binding   761  DLG4_PDZ3
## 5  0.06452813    0.07905913       0.3099118   761_808 binding   761  DLG4_PDZ3
## 6  0.07812631    0.49614970       1.9449068   761_808 binding   761  DLG4_PDZ3
##   alignment_position.y WT_aa.y structural_alignment_pos.y    consv.y
## 1                  V53       A                         53 0.02246867
## 2                  V53       A                         53 0.02246867
## 3                  V53       A                         53 0.02246867
## 4                  V53       A                         53 0.02246867
## 5                  V53       A                         53 0.02246867
## 6                  V53       A                         53 0.02246867
##   HAmin_ligand.y scHAmin_ligand.y    X1.y     X2.y     X3.y    X4.y    X5.y
## 1       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 2       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 3       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 4       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 5       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 6       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
##      X6.y     X7.y     X8.y    X9.y Pos_ref.y closest_ligand_pos.y
## 1 14.1115 18.51017 13.41275 12.9716       343                    9
## 2 14.1115 18.51017 13.41275 12.9716       343                    9
## 3 14.1115 18.51017 13.41275 12.9716       343                    9
## 4 14.1115 18.51017 13.41275 12.9716       343                    9
## 5 14.1115 18.51017 13.41275 12.9716       343                    9
## 6 14.1115 18.51017 13.41275 12.9716       343                    9
##   closest_ligand_aa.y binding_interface_5A.y WT_aa.1.y secondary_structure.y
## 1                   V                  FALSE         A                  loop
## 2                   V                  FALSE         A                  loop
## 3                   V                  FALSE         A                  loop
## 4                   V                  FALSE         A                  loop
## 5                   V                  FALSE         A                  loop
## 6                   V                  FALSE         A                  loop
##     rsasa.y structure_location.y wt_aa.y mt_aa.y pos_eve.y popEVE.y ESM1v.y
## 1 0.8103598              surface       A       C       386   -4.577 -10.991
## 2 0.8103598              surface       A       D       386   -4.690 -11.585
## 3 0.8103598              surface       A       E       386   -4.522 -10.914
## 4 0.8103598              surface       A       F       386   -5.093 -13.794
## 5 0.8103598              surface       A       G       386   -4.008  -7.404
## 6 0.8103598              surface       A       I       386   -4.718 -11.800
##       z_stab    p_stab   z_destab   p_destab         class
## 1  3.3466602 0.9995910  0.8338590 0.20218022     no change
## 2  0.7921010 0.7858491 -1.7358159 0.95870181     no change
## 3 -0.6128781 0.2699785 -1.7638403 0.96112056     no change
## 4  4.1890228 0.9999860  2.1030834 0.01772924 destabilizing
## 5  2.6778755 0.9962955  0.3540300 0.36165821     no change
## 6  2.1183358 0.9829267 -0.4417253 0.67065601     no change
pdz3_final_df_ddg_2plot$orthosteric <- FALSE
pdz3_final_df_ddg_2plot$orthosteric[pdz3_final_df_ddg_2plot$Pos_ref.x %in% ortho_list] <- TRUE

table(pdz3_final_df_ddg_2plot$orthosteric)
## 
## FALSE  TRUE 
##  1341   246
#FALSE  TRUE 
# 1341   246 
pdz3_final_df_ddg_2plot_all <- pdz3_final_df_ddg_2plot
nrow(pdz3_final_df_ddg_2plot_all)
## [1] 1587
pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot[!is.na(pdz3_final_df_ddg_2plot$b_ddg_pred), ]
nrow(pdz3_final_df_ddg_2plot) #1567
## [1] 1567
head(pdz3_final_df_ddg_2plot)
##   id_eve X.x V1.x id_old.x pos_am.x trait_name.x f_ddg_pred f_ddg_pred_sd
## 1  A386C 595  596     A33C       33      Folding  0.6659222     0.1989811
## 2  A386D 596  597     A33D       33      Folding  0.1566707     0.1977913
## 3  A386E 597  598     A33E       33      Folding -0.2662460     0.4344191
## 4  A386F 598  599     A33F       33      Folding  1.0041094     0.2397002
## 5  A386G 599  600     A33G       33      Folding  0.5761733     0.2151606
## 6  A386I 600  601     A33I       33      Folding  0.4137276     0.1953078
##   ci95_kcal.mol.x     library.x assay.x pdz.x pdz_name.x alignment_position.x
## 1       0.7800060 761_abundance folding   761  DLG4_PDZ3                  V53
## 2       0.7753419 761_abundance folding   761  DLG4_PDZ3                  V53
## 3       1.7029230 761_abundance folding   761  DLG4_PDZ3                  V53
## 4       0.9396246 761_abundance folding   761  DLG4_PDZ3                  V53
## 5       0.8434296 761_abundance folding   761  DLG4_PDZ3                  V53
## 6       0.7656068 761_abundance folding   761  DLG4_PDZ3                  V53
##   WT_aa.x structural_alignment_pos.x    consv.x HAmin_ligand.x scHAmin_ligand.x
## 1       A                         53 0.02246867       10.57668          12.9716
## 2       A                         53 0.02246867       10.57668          12.9716
## 3       A                         53 0.02246867       10.57668          12.9716
## 4       A                         53 0.02246867       10.57668          12.9716
## 5       A                         53 0.02246867       10.57668          12.9716
## 6       A                         53 0.02246867       10.57668          12.9716
##      X1.x     X2.x     X3.x    X4.x    X5.x    X6.x     X7.x     X8.x    X9.x
## 1 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 2 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 3 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 4 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 5 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 6 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
##   Pos_ref.x closest_ligand_pos.x closest_ligand_aa.x binding_interface_5A.x
## 1       343                    9                   V                  FALSE
## 2       343                    9                   V                  FALSE
## 3       343                    9                   V                  FALSE
## 4       343                    9                   V                  FALSE
## 5       343                    9                   V                  FALSE
## 6       343                    9                   V                  FALSE
##   WT_aa.1.x secondary_structure.x   rsasa.x structure_location.x wt_aa.x
## 1         A                  loop 0.8103598              surface       A
## 2         A                  loop 0.8103598              surface       A
## 3         A                  loop 0.8103598              surface       A
## 4         A                  loop 0.8103598              surface       A
## 5         A                  loop 0.8103598              surface       A
## 6         A                  loop 0.8103598              surface       A
##   mt_aa.x pos_eve.x popEVE.x ESM1v.x  X.y V1.y id_old.y pos_am.y trait_name.y
## 1       C       386   -4.577 -10.991 2158 2160     A33C       33      Binding
## 2       D       386   -4.690 -11.585 2159 2161     A33D       33      Binding
## 3       E       386   -4.522 -10.914 2160 2162     A33E       33      Binding
## 4       F       386   -5.093 -13.794 2161 2163     A33F       33      Binding
## 5       G       386   -4.008  -7.404 2162 2164     A33G       33      Binding
## 6       I       386   -4.718 -11.800 2163 2165     A33I       33      Binding
##    b_ddg_pred b_ddg_pred_sd ci95_kcal.mol.y library.y assay.y pdz.y pdz_name.y
## 1  0.11921857    0.06126311       0.2401514   761_808 binding   761  DLG4_PDZ3
## 2 -0.02530033    0.82292100       3.2258503   761_808 binding   761  DLG4_PDZ3
## 3  0.04624840    0.06722154       0.2635084   761_808 binding   761  DLG4_PDZ3
## 4  0.27775264    0.16555277       0.6489668   761_808 binding   761  DLG4_PDZ3
## 5  0.06452813    0.07905913       0.3099118   761_808 binding   761  DLG4_PDZ3
## 6  0.07812631    0.49614970       1.9449068   761_808 binding   761  DLG4_PDZ3
##   alignment_position.y WT_aa.y structural_alignment_pos.y    consv.y
## 1                  V53       A                         53 0.02246867
## 2                  V53       A                         53 0.02246867
## 3                  V53       A                         53 0.02246867
## 4                  V53       A                         53 0.02246867
## 5                  V53       A                         53 0.02246867
## 6                  V53       A                         53 0.02246867
##   HAmin_ligand.y scHAmin_ligand.y    X1.y     X2.y     X3.y    X4.y    X5.y
## 1       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 2       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 3       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 4       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 5       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 6       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
##      X6.y     X7.y     X8.y    X9.y Pos_ref.y closest_ligand_pos.y
## 1 14.1115 18.51017 13.41275 12.9716       343                    9
## 2 14.1115 18.51017 13.41275 12.9716       343                    9
## 3 14.1115 18.51017 13.41275 12.9716       343                    9
## 4 14.1115 18.51017 13.41275 12.9716       343                    9
## 5 14.1115 18.51017 13.41275 12.9716       343                    9
## 6 14.1115 18.51017 13.41275 12.9716       343                    9
##   closest_ligand_aa.y binding_interface_5A.y WT_aa.1.y secondary_structure.y
## 1                   V                  FALSE         A                  loop
## 2                   V                  FALSE         A                  loop
## 3                   V                  FALSE         A                  loop
## 4                   V                  FALSE         A                  loop
## 5                   V                  FALSE         A                  loop
## 6                   V                  FALSE         A                  loop
##     rsasa.y structure_location.y wt_aa.y mt_aa.y pos_eve.y popEVE.y ESM1v.y
## 1 0.8103598              surface       A       C       386   -4.577 -10.991
## 2 0.8103598              surface       A       D       386   -4.690 -11.585
## 3 0.8103598              surface       A       E       386   -4.522 -10.914
## 4 0.8103598              surface       A       F       386   -5.093 -13.794
## 5 0.8103598              surface       A       G       386   -4.008  -7.404
## 6 0.8103598              surface       A       I       386   -4.718 -11.800
##       z_stab    p_stab   z_destab   p_destab         class orthosteric
## 1  3.3466602 0.9995910  0.8338590 0.20218022     no change       FALSE
## 2  0.7921010 0.7858491 -1.7358159 0.95870181     no change       FALSE
## 3 -0.6128781 0.2699785 -1.7638403 0.96112056     no change       FALSE
## 4  4.1890228 0.9999860  2.1030834 0.01772924 destabilizing       FALSE
## 5  2.6778755 0.9962955  0.3540300 0.36165821     no change       FALSE
## 6  2.1183358 0.9829267 -0.4417253 0.67065601     no change       FALSE
pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>% dplyr::select(id_eve, f_ddg_pred, f_ddg_pred_sd,
                                                                     ESM1v.x, b_ddg_pred, b_ddg_pred_sd,
                                                                     orthosteric,class) %>%
  dplyr::rename(ESM1v = ESM1v.x)

pdz3_final_df_ddg_2plot_active <- pdz3_final_df_ddg_2plot %>% filter(orthosteric == TRUE)
mean(pdz3_final_df_ddg_2plot_active$b_ddg_pred) #0.837005
## [1] 0.837005
pdz3_final_df_ddg_2plot$allosteric <- FALSE
pdz3_final_df_ddg_2plot$allosteric[pdz3_final_df_ddg_2plot$b_ddg_pred >= mean(pdz3_final_df_ddg_2plot_active$b_ddg_pred)] <- TRUE
pdz3_final_df_ddg_2plot$allosteric[pdz3_final_df_ddg_2plot$orthosteric == TRUE] <- FALSE
table(pdz3_final_df_ddg_2plot$allosteric)
## 
## FALSE  TRUE 
##  1202   365
head(pdz3_final_df_ddg_2plot)
##   id_eve f_ddg_pred f_ddg_pred_sd   ESM1v  b_ddg_pred b_ddg_pred_sd orthosteric
## 1  A386C  0.6659222     0.1989811 -10.991  0.11921857    0.06126311       FALSE
## 2  A386D  0.1566707     0.1977913 -11.585 -0.02530033    0.82292100       FALSE
## 3  A386E -0.2662460     0.4344191 -10.914  0.04624840    0.06722154       FALSE
## 4  A386F  1.0041094     0.2397002 -13.794  0.27775264    0.16555277       FALSE
## 5  A386G  0.5761733     0.2151606  -7.404  0.06452813    0.07905913       FALSE
## 6  A386I  0.4137276     0.1953078 -11.800  0.07812631    0.49614970       FALSE
##           class allosteric
## 1     no change      FALSE
## 2     no change      FALSE
## 3     no change      FALSE
## 4 destabilizing      FALSE
## 5     no change      FALSE
## 6     no change      FALSE
##########
# Function to bootstrap adjusted R-squared
bootstrap_adjusted_r_squared <- function(input_data, model_id, n_bootstrap = 1000) {
  
  # Initialize a vector to store the adjusted R-squared values from each bootstrap
  adjusted_r_squared_values <- numeric(n_bootstrap)
  
  # Define the number of rows (n)
  n <- nrow(input_data)
  
  # Number of predictors (p) - In your case, there are two predictors: f_ddg_pred and sos1_ddg
  p <- 2
  
  # Total sum of squares (SST) for the original data
  ss_total <- sum((input_data$ESM1v - mean(input_data$ESM1v))^2)
  
  # Perform the bootstrapping
  for (k in 1:n_bootstrap) {
    # Sample indices with replacement
    indices <- sample(1:n, size = n, replace = TRUE)
    
    # Create bootstrap samples for actual and predicted values using the sampled indices
    y_bootstrap <- input_data$ESM1v[indices]
    y_hat_bootstrap <- input_data[[paste0("predicted_", model_id)]][indices]
    
    # Residual sum of squares (SSR) for the bootstrap sample
    ss_residual <- sum((y_bootstrap - y_hat_bootstrap)^2)
    
    # Calculate regular R-squared for this bootstrap sample
    r_squared <- 1 - (ss_residual / ss_total)
    
    # Calculate adjusted R-squared for this bootstrap sample
    adjusted_r_squared_values[k] <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
  }
  
  # Sort the adjusted R-squared values
  adjusted_r_squared_sorted <- sort(adjusted_r_squared_values)
  
  # Get the 95% confidence intervals
  lower_ci <- adjusted_r_squared_sorted[round(n_bootstrap * 0.025)]
  upper_ci <- adjusted_r_squared_sorted[round(n_bootstrap * 0.975)]
  
  # Return the confidence intervals and the adjusted R-squared values
  return(list(lower_ci = lower_ci, upper_ci = upper_ci, adjusted_r_squared_values = adjusted_r_squared_values))
}

fit_model_and_diagnostics_ddgf <- function(input_data, model_id) {
  
  # Fit the linear regression model
  fit <- lm(ESM1v ~ f_ddg_pred, data = input_data)
  
  # Calculate predictions and residuals
  input_data[[paste0("predicted_", model_id)]] <- predict(fit, newdata = input_data)
  input_data[[paste0("residuals_", model_id)]] <- input_data$ESM1v - input_data[[paste0("predicted_", model_id)]]
  
  # R-squared and Adjusted R-squared Calculation
  r_squared <- summary(fit)$r.squared
  adjusted_r_squared <- summary(fit)$adj.r.squared
  
  # Bootstrapping for adjusted R-squared, RMSE, and AIC
  bootstrap_adj_r2 <- bootstrap_adjusted_r_squared(input_data, model_id, n_bootstrap = 1000)
  
  # Print results
  print(paste("Adjusted R-squared:", round(adjusted_r_squared, 4)))
  print(paste("Adjusted R-squared 95% CI:", round(bootstrap_adj_r2$lower_ci, 4), "-", round(bootstrap_adj_r2$upper_ci, 4)))
  
  # Return fit object, AIC, RMSE, and adjusted R-squared confidence intervals
  return(list(
    fit = fit, 
    adj_r2_values = bootstrap_adj_r2$adjusted_r_squared_values,  
    adjusted_r_squared = adjusted_r_squared,
    adjusted_r_squared_ci = c(bootstrap_adj_r2$lower_ci, bootstrap_adj_r2$upper_ci),
    input_data = input_data
  ))
}

fit_model_and_diagnostics_ddga <- function(input_data, model_id) {
  
  # Fit the linear regression model
  fit <- lm(ESM1v ~ b_ddg_pred, data = input_data)
  
  # Calculate predictions and residuals
  input_data[[paste0("predicted_", model_id)]] <- predict(fit, newdata = input_data)
  input_data[[paste0("residuals_", model_id)]] <- input_data$ESM1v - input_data[[paste0("predicted_", model_id)]]
  
  # R-squared and Adjusted R-squared Calculation
  r_squared <- summary(fit)$r.squared
  adjusted_r_squared <- summary(fit)$adj.r.squared
  
  # Bootstrapping for adjusted R-squared, RMSE, and AIC
  bootstrap_adj_r2 <- bootstrap_adjusted_r_squared(input_data, model_id, n_bootstrap = 1000)
  
  # Print results
  print(paste("Adjusted R-squared:", round(adjusted_r_squared, 4)))
  print(paste("Adjusted R-squared 95% CI:", round(bootstrap_adj_r2$lower_ci, 4), "-", round(bootstrap_adj_r2$upper_ci, 4)))
  
  # Return fit object, AIC, RMSE, and adjusted R-squared confidence intervals
  return(list(
    fit = fit, 
    adj_r2_values = bootstrap_adj_r2$adjusted_r_squared_values,  
    adjusted_r_squared = adjusted_r_squared,
    adjusted_r_squared_ci = c(bootstrap_adj_r2$lower_ci, bootstrap_adj_r2$upper_ci),
    input_data = input_data
  ))
}

fit_model_and_diagnostics_both <- function(input_data, model_id) {
  
  # Fit the linear regression model
  fit <- lm(ESM1v ~ f_ddg_pred + b_ddg_pred, data = input_data)
  
  # Calculate predictions and residuals
  input_data[[paste0("predicted_", model_id)]] <- predict(fit, newdata = input_data)
  input_data[[paste0("residuals_", model_id)]] <- input_data$ESM1v - input_data[[paste0("predicted_", model_id)]]
  
  # R-squared and Adjusted R-squared Calculation
  r_squared <- summary(fit)$r.squared
  adjusted_r_squared <- summary(fit)$adj.r.squared
  
  # Bootstrapping for adjusted R-squared, RMSE, and AIC
  bootstrap_adj_r2 <- bootstrap_adjusted_r_squared(input_data, model_id, n_bootstrap = 1000)
  
  # Print results
  print(paste("Adjusted R-squared:", round(adjusted_r_squared, 4)))
  print(paste("Adjusted R-squared 95% CI:", round(bootstrap_adj_r2$lower_ci, 4), "-", round(bootstrap_adj_r2$upper_ci, 4)))
  
  # Return fit object, AIC, RMSE, and adjusted R-squared confidence intervals
  return(list(
    fit = fit, 
    adj_r2_values = bootstrap_adj_r2$adjusted_r_squared_values,  
    adjusted_r_squared = adjusted_r_squared,
    adjusted_r_squared_ci = c(bootstrap_adj_r2$lower_ci, bootstrap_adj_r2$upper_ci),
    input_data = input_data
  ))
}
############################# MODEL 0: all mutations, ddGf + ddGa #############################
set.seed(11)
# Usage example
m0_results <- fit_model_and_diagnostics_both(input_data = pdz3_final_df_ddg_2plot, model_id = "m0")
## [1] "Adjusted R-squared: 0.3273"
## [1] "Adjusted R-squared 95% CI: 0.2749 - 0.375"
m1_results <- fit_model_and_diagnostics_ddgf(input_data = pdz3_final_df_ddg_2plot, model_id = "m1")
## [1] "Adjusted R-squared: 0.2147"
## [1] "Adjusted R-squared 95% CI: 0.1525 - 0.2681"
m2_results <- fit_model_and_diagnostics_ddga(input_data = pdz3_final_df_ddg_2plot, model_id = "m2")
## [1] "Adjusted R-squared: 0.2679"
## [1] "Adjusted R-squared 95% CI: 0.207 - 0.3224"
############################# MODEL 3: non-active mutations, ddGf + ddGa #############################
pdz3_final_df_ddg_2plot_non_active <- pdz3_final_df_ddg_2plot %>% filter(orthosteric == FALSE)

m3_results <- fit_model_and_diagnostics_both(input_data = pdz3_final_df_ddg_2plot_non_active, model_id = "m3")
## [1] "Adjusted R-squared: 0.3641"
## [1] "Adjusted R-squared 95% CI: 0.3123 - 0.4169"
m4_results <- fit_model_and_diagnostics_ddgf(input_data = pdz3_final_df_ddg_2plot_non_active, model_id = "m4")
## [1] "Adjusted R-squared: 0.2568"
## [1] "Adjusted R-squared 95% CI: 0.1968 - 0.3145"
m5_results <- fit_model_and_diagnostics_ddga(input_data = pdz3_final_df_ddg_2plot_non_active, model_id = "m5")
## [1] "Adjusted R-squared: 0.3009"
## [1] "Adjusted R-squared 95% CI: 0.2397 - 0.3572"
############################# MODEL 6: active mutations, ddGf + ddGa #############################
pdz3_final_df_ddg_2plot_active <- pdz3_final_df_ddg_2plot %>% filter(orthosteric == TRUE)

m6_results <- fit_model_and_diagnostics_ddgf(input_data = pdz3_final_df_ddg_2plot_active, model_id = "m6")
## [1] "Adjusted R-squared: 0.1165"
## [1] "Adjusted R-squared 95% CI: -0.03 - 0.2523"
m7_results <- fit_model_and_diagnostics_ddga(input_data = pdz3_final_df_ddg_2plot_active, model_id = "m7")
## [1] "Adjusted R-squared: 0.1155"
## [1] "Adjusted R-squared 95% CI: -0.037 - 0.2582"
m8_results <- fit_model_and_diagnostics_both(input_data = pdz3_final_df_ddg_2plot_active, model_id = "m8")
## [1] "Adjusted R-squared: 0.1802"
## [1] "Adjusted R-squared 95% CI: 0.0403 - 0.308"
anova(m0_results$fit, m1_results$fit)
## Analysis of Variance Table
## 
## Model 1: ESM1v ~ f_ddg_pred + b_ddg_pred
## Model 2: ESM1v ~ f_ddg_pred
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1   1564 18199                                  
## 2   1565 21260 -1   -3061.2 263.07 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1: ESM1v ~ f_ddg_pred + b_ddg_pred
# Model 2: ESM1v ~ f_ddg_pred
#   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
# 1   1564 18199                                  
# 2   1565 21260 -1   -3061.2 263.07 < 2.2e-16 ***
anova(m4_results$fit, m3_results$fit)
## Analysis of Variance Table
## 
## Model 1: ESM1v ~ f_ddg_pred
## Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1   1323 15721                                  
## 2   1322 13441  1    2279.3 224.18 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1: ESM1v ~ f_ddg_pred
# Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
#   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
# 1   1323 15721                                  
# 2   1322 13441  1    2279.3 224.18 < 2.2e-16 ***
anova(m6_results$fit, m8_results$fit)
## Analysis of Variance Table
## 
## Model 1: ESM1v ~ f_ddg_pred
## Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    240 1845.0                                  
## 2    239 1704.8  1    140.21 19.657 1.413e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1: ESM1v ~ f_ddg_pred
# Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
#   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
# 1    240 1845.0                                  
# 2    239 1704.8  1    140.21 19.657 1.413e-05 ***
# Step 1: Build raw data (same as before)
plot_data <- data.frame(
  category = rep(c("whole protein", "non-orthosteric", "orthosteric"), each = 3000),
  type = rep(c("abundance", "activity", "both"), times = 3, each = 1000),
  adj_r2_values = c(
    m1_results$adj_r2_values, m2_results$adj_r2_values, m0_results$adj_r2_values,
    m4_results$adj_r2_values, m5_results$adj_r2_values, m3_results$adj_r2_values,
    m6_results$adj_r2_values, m7_results$adj_r2_values, m8_results$adj_r2_values
  )
)

# Step 2: Build CI summary table
ci_summary <- data.frame(
  category = rep(c("whole protein", "non-orthosteric", "orthosteric"), each = 3),
  type = rep(c("abundance", "activity", "both"), times = 3),
  mean_adj_r2 = c(
    mean(m1_results$adj_r2_values), mean(m2_results$adj_r2_values), m0_results$adjusted_r_squared,
    mean(m4_results$adj_r2_values), mean(m5_results$adj_r2_values), mean(m3_results$adj_r2_values),
    mean(m6_results$adj_r2_values), mean(m7_results$adj_r2_values), mean(m8_results$adj_r2_values)
  ),
  ci_lower = c(
    quantile(m1_results$adj_r2_values, 0.025), quantile(m2_results$adj_r2_values, 0.025), m0_results$adjusted_r_squared_ci[1],
    quantile(m4_results$adj_r2_values, 0.025), quantile(m5_results$adj_r2_values, 0.025), quantile(m3_results$adj_r2_values, 0.025),
    quantile(m6_results$adj_r2_values, 0.025), quantile(m7_results$adj_r2_values, 0.025), quantile(m8_results$adj_r2_values, 0.025)
  ),
  ci_upper = c(
    quantile(m1_results$adj_r2_values, 0.975), quantile(m2_results$adj_r2_values, 0.975), m0_results$adjusted_r_squared_ci[2],
    quantile(m4_results$adj_r2_values, 0.975), quantile(m5_results$adj_r2_values, 0.975), quantile(m3_results$adj_r2_values, 0.975),
    quantile(m6_results$adj_r2_values, 0.975), quantile(m7_results$adj_r2_values, 0.975), quantile(m8_results$adj_r2_values, 0.975)
  )
)

# Factor levels for consistency
plot_data$category <- factor(plot_data$category, levels = c("whole protein", "non-orthosteric", "orthosteric"))
plot_data$type <- factor(plot_data$type, levels = c("abundance", "activity", "both"))
ci_summary$category <- factor(ci_summary$category, levels = levels(plot_data$category))
ci_summary$type <- factor(ci_summary$type, levels = levels(plot_data$type))

# Step 3: Plot using CI summary for bars
fig3E <- ggplot(ci_summary, aes(x = type, y = mean_adj_r2, fill = type)) +
  geom_col(width = 0.6, alpha = 0.85) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2, color = "black") +
  geom_text(aes(label = sprintf("%.3f", mean_adj_r2)), vjust = -1.5, size = 5, color = "black") +
  facet_wrap(~category) +
  scale_fill_brewer(palette = "Set2") +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.line = element_line(linewidth = 0.6),
    strip.text = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(size = 16, face = "bold", angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    axis.title.y = element_text(size = 18),
    plot.title = element_text(size = 18, face = "bold"),
    plot.subtitle = element_text(size = 16, face = "italic")
  ) +
  labs(
    title = "PSD-95-PDZ3", 
    subtitle = "Linear regression against ESM1v", 
    x = "", 
    y = "Adjusted R²"
  ) +
  # Significance comparisons using full plot_data
  geom_signif(data = subset(plot_data, category == "whole protein"),
              aes(x = type, y = adj_r2_values),
              comparisons = list( c("abundance", "both")),
              annotations = "p < 2.2e-16",
              y_position = c( 0.65),
              tip_length = 0.02,
              step_increase = 0.05,
              test = "wilcox.test",
              textsize = 5) +
  geom_signif(data = subset(plot_data, category == "non-orthosteric"),
              aes(x = type, y = adj_r2_values),
              comparisons = list( c("abundance", "both")),
              annotations = "p < 2.2e-16",
              y_position = c(0.65),
              tip_length = 0.02,
              step_increase = 0.05,
              test = "wilcox.test",
              textsize = 5) +
  geom_signif(data = subset(plot_data, category == "orthosteric"),
              aes(x = type, y = adj_r2_values),
              comparisons = list( c("abundance", "both")),
              annotations = "p = 1.413e-05",
              y_position = c(0.65),
              step_increase = 0.05,
              test = "wilcox.test",
              textsize = 5) +
  ylim(-0.1, 1)
## Warning in geom_signif(data = subset(plot_data, category == "whole protein"), :
## You have set data and mapping, are you sure that manual = FALSE is correct?
## Warning in geom_signif(data = subset(plot_data, category == "non-orthosteric"),
## : You have set data and mapping, are you sure that manual = FALSE is correct?
## Warning in geom_signif(data = subset(plot_data, category == "orthosteric"), :
## You have set data and mapping, are you sure that manual = FALSE is correct?
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_barplot.pdf", 
       plot = fig3E, width = 8, height = 4, dpi = 300)
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_signif()`).
fig3E
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_signif()`).

doubledeepms__minimum_interchain_distances_from_PDB_perres <- function(
    input_file,
  chain_query = "A",
  chain_target = "B"
  ){
  
  #load PDB structure
#   sink(file = "/dev/null")
  pdb <- bio3d::read.pdb(input_file, rm.alt = TRUE)
#   sink()

  ### Atom selections
  ###########################

    #Protein atoms
  sele_protein <- bio3d::atom.select(pdb, "protein", verbose=FALSE)
    #Hydrogen atoms
  sele_H <-bio3d::atom.select(pdb, "h", verbose=FALSE)
    #Water atoms
  sele_water <- bio3d::atom.select(pdb, "water", verbose=FALSE)
    #Side chain atoms
  sele_sc <- bio3d::atom.select(pdb, "sidechain", verbose=FALSE)
    #C-alpha atoms
  sele_ca <- bio3d::atom.select(pdb, "calpha", verbose=FALSE)
    #Glycine c-alpha atoms
  sele_glyca <- bio3d::atom.select(pdb, resid = "GLY", string = "calpha", verbose=FALSE)

  ### Combine atom selections
  ###########################

  #Heavy atoms
    sele_HA <- bio3d::combine.select(sele_protein, sele_H, sele_water, operator = "-", verbose=FALSE)

  #Side chain heavy atoms + c-alpha for glycine
    sele_prot_sc <- bio3d::combine.select(sele_protein, sele_sc, operator = "AND", verbose=FALSE)
    sele_prot_sc_glyca <- bio3d::combine.select(sele_prot_sc, sele_glyca, operator = "OR", verbose=FALSE)
    sele_scHA <- bio3d::combine.select(sele_prot_sc_glyca, sele_H, sele_water, operator = "-", verbose=FALSE)

    #List
    sele_list <- list(
        "HA" = sele_HA,
        "scHA" = sele_scHA)

  ### Calculate minimum target chain distances
  ###########################

    result_dt <- data.table()
  for(metric in names(sele_list)){
      #Distance matrix
        pdb_sub <- bio3d::trim.pdb(pdb, sele_list[[metric]])
      dist_mat <- bio3d::dm.xyz(pdb_sub$xyz, grpby=apply(pdb_sub$atom[,c("resno", "chain")], 1, paste, collapse = "_"), scut=0, mask.lower = FALSE)
      resno_sub <- unique(pdb_sub$atom[,c("resno", "chain")])
      #Ligand distance matrix
      ligand_dist <- dist_mat[resno_sub[,"chain"]==chain_query,resno_sub[,"chain"]==chain_target]
      ligand_dist_df <- ligand_dist
      colnames(ligand_dist_df) <- resno_sub[resno_sub[,"chain"]==chain_target,"resno"]
      rownames(ligand_dist_df) <- resno_sub[resno_sub[,"chain"]==chain_query,"resno"]
      #chain target names
      #Absolute residue number
      ligand_dist_dt <- data.table(Pos = resno_sub[resno_sub[,"chain"]==chain_query,"resno"])
      #Minimum ligand distance
      ligand_dist_dt[, min_dist := apply(ligand_dist, 1, min)]
      names(ligand_dist_dt)[2] <- paste0(metric, "min_ligand")
      if(nrow(result_dt)==0){
        result_dt <- ligand_dist_dt
      }else{
        result_dt <- merge(result_dt, ligand_dist_dt, by = "Pos", all = T)
      }
     result_dt_with_ligand_matrix <- cbind(result_dt, ligand_dist_df)
  }

  #Return
    return(result_dt_with_ligand_matrix)

}

pdz3_out <- doubledeepms__minimum_interchain_distances_from_PDB_perres(
  input_file = "/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/domains/1be9.pdb",
  chain_query = "A",
  chain_target = "B"
)

pdz3_out
## Key: <Pos>
##        Pos HAmin_ligand scHAmin_ligand        5        6        7        8
##      <int>        <num>          <num>    <num>    <num>    <num>    <num>
##   1:   301     27.81682       29.29812 36.60292 29.29812 34.28569 31.58151
##   2:   302     24.77413       24.77413 32.59924 25.50251 29.70773 26.95297
##   3:   303     23.61197       23.88436 30.97802 23.88436 29.30675 27.23208
##   4:   304     25.56612       27.36444 34.43185 27.36444 33.26058 31.46158
##   5:   305     24.06314       26.75911 32.99787 26.75911 31.21189 29.67537
##  ---                                                                      
## 111:   411     16.60658       18.35828 25.27723 18.35828 24.70556 23.51490
## 112:   412     11.83225       11.83225 18.16193 11.83225 17.95221 17.84223
## 113:   413     14.18169       16.69390 23.98139 16.69390 24.61715 23.71109
## 114:   414     11.22331       11.22331 18.87227 11.22331 19.65124 18.73224
## 115:   415     14.62439       16.80815 23.80095 16.80815 25.75406 24.65175
##             9
##         <num>
##   1: 29.91678
##   2: 24.77413
##   3: 25.79389
##   4: 29.93395
##   5: 26.89401
##  ---         
## 111: 22.07162
## 112: 16.53124
## 113: 23.90523
## 114: 19.91609
## 115: 26.47222
pdz3_final_df_ddg_2plot_out <- pdz3_final_df_ddg_2plot %>% filter(orthosteric == FALSE)

# Fit a loess model using the filtered data
loess_fit <- loess(ESM1v ~ f_ddg_pred, data = pdz3_final_df_ddg_2plot_out, span = 0.7, family = "symmetric")

# Predict fitted values for ALL data points using the loess model 
pdz3_final_df_ddg_2plot_all$fitted <- predict(loess_fit, newdata = pdz3_final_df_ddg_2plot_all)

# Calculate residuals for ALL points
pdz3_final_df_ddg_2plot_all$residuals <- pdz3_final_df_ddg_2plot_all$ESM1v.x - pdz3_final_df_ddg_2plot_all$fitted

head(pdz3_final_df_ddg_2plot_all)
##   id_eve X.x V1.x id_old.x pos_am.x trait_name.x f_ddg_pred f_ddg_pred_sd
## 1  A386C 595  596     A33C       33      Folding  0.6659222     0.1989811
## 2  A386D 596  597     A33D       33      Folding  0.1566707     0.1977913
## 3  A386E 597  598     A33E       33      Folding -0.2662460     0.4344191
## 4  A386F 598  599     A33F       33      Folding  1.0041094     0.2397002
## 5  A386G 599  600     A33G       33      Folding  0.5761733     0.2151606
## 6  A386I 600  601     A33I       33      Folding  0.4137276     0.1953078
##   ci95_kcal.mol.x     library.x assay.x pdz.x pdz_name.x alignment_position.x
## 1       0.7800060 761_abundance folding   761  DLG4_PDZ3                  V53
## 2       0.7753419 761_abundance folding   761  DLG4_PDZ3                  V53
## 3       1.7029230 761_abundance folding   761  DLG4_PDZ3                  V53
## 4       0.9396246 761_abundance folding   761  DLG4_PDZ3                  V53
## 5       0.8434296 761_abundance folding   761  DLG4_PDZ3                  V53
## 6       0.7656068 761_abundance folding   761  DLG4_PDZ3                  V53
##   WT_aa.x structural_alignment_pos.x    consv.x HAmin_ligand.x scHAmin_ligand.x
## 1       A                         53 0.02246867       10.57668          12.9716
## 2       A                         53 0.02246867       10.57668          12.9716
## 3       A                         53 0.02246867       10.57668          12.9716
## 4       A                         53 0.02246867       10.57668          12.9716
## 5       A                         53 0.02246867       10.57668          12.9716
## 6       A                         53 0.02246867       10.57668          12.9716
##      X1.x     X2.x     X3.x    X4.x    X5.x    X6.x     X7.x     X8.x    X9.x
## 1 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 2 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 3 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 4 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 5 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 6 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
##   Pos_ref.x closest_ligand_pos.x closest_ligand_aa.x binding_interface_5A.x
## 1       343                    9                   V                  FALSE
## 2       343                    9                   V                  FALSE
## 3       343                    9                   V                  FALSE
## 4       343                    9                   V                  FALSE
## 5       343                    9                   V                  FALSE
## 6       343                    9                   V                  FALSE
##   WT_aa.1.x secondary_structure.x   rsasa.x structure_location.x wt_aa.x
## 1         A                  loop 0.8103598              surface       A
## 2         A                  loop 0.8103598              surface       A
## 3         A                  loop 0.8103598              surface       A
## 4         A                  loop 0.8103598              surface       A
## 5         A                  loop 0.8103598              surface       A
## 6         A                  loop 0.8103598              surface       A
##   mt_aa.x pos_eve.x popEVE.x ESM1v.x  X.y V1.y id_old.y pos_am.y trait_name.y
## 1       C       386   -4.577 -10.991 2158 2160     A33C       33      Binding
## 2       D       386   -4.690 -11.585 2159 2161     A33D       33      Binding
## 3       E       386   -4.522 -10.914 2160 2162     A33E       33      Binding
## 4       F       386   -5.093 -13.794 2161 2163     A33F       33      Binding
## 5       G       386   -4.008  -7.404 2162 2164     A33G       33      Binding
## 6       I       386   -4.718 -11.800 2163 2165     A33I       33      Binding
##    b_ddg_pred b_ddg_pred_sd ci95_kcal.mol.y library.y assay.y pdz.y pdz_name.y
## 1  0.11921857    0.06126311       0.2401514   761_808 binding   761  DLG4_PDZ3
## 2 -0.02530033    0.82292100       3.2258503   761_808 binding   761  DLG4_PDZ3
## 3  0.04624840    0.06722154       0.2635084   761_808 binding   761  DLG4_PDZ3
## 4  0.27775264    0.16555277       0.6489668   761_808 binding   761  DLG4_PDZ3
## 5  0.06452813    0.07905913       0.3099118   761_808 binding   761  DLG4_PDZ3
## 6  0.07812631    0.49614970       1.9449068   761_808 binding   761  DLG4_PDZ3
##   alignment_position.y WT_aa.y structural_alignment_pos.y    consv.y
## 1                  V53       A                         53 0.02246867
## 2                  V53       A                         53 0.02246867
## 3                  V53       A                         53 0.02246867
## 4                  V53       A                         53 0.02246867
## 5                  V53       A                         53 0.02246867
## 6                  V53       A                         53 0.02246867
##   HAmin_ligand.y scHAmin_ligand.y    X1.y     X2.y     X3.y    X4.y    X5.y
## 1       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 2       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 3       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 4       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 5       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 6       10.57668          12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
##      X6.y     X7.y     X8.y    X9.y Pos_ref.y closest_ligand_pos.y
## 1 14.1115 18.51017 13.41275 12.9716       343                    9
## 2 14.1115 18.51017 13.41275 12.9716       343                    9
## 3 14.1115 18.51017 13.41275 12.9716       343                    9
## 4 14.1115 18.51017 13.41275 12.9716       343                    9
## 5 14.1115 18.51017 13.41275 12.9716       343                    9
## 6 14.1115 18.51017 13.41275 12.9716       343                    9
##   closest_ligand_aa.y binding_interface_5A.y WT_aa.1.y secondary_structure.y
## 1                   V                  FALSE         A                  loop
## 2                   V                  FALSE         A                  loop
## 3                   V                  FALSE         A                  loop
## 4                   V                  FALSE         A                  loop
## 5                   V                  FALSE         A                  loop
## 6                   V                  FALSE         A                  loop
##     rsasa.y structure_location.y wt_aa.y mt_aa.y pos_eve.y popEVE.y ESM1v.y
## 1 0.8103598              surface       A       C       386   -4.577 -10.991
## 2 0.8103598              surface       A       D       386   -4.690 -11.585
## 3 0.8103598              surface       A       E       386   -4.522 -10.914
## 4 0.8103598              surface       A       F       386   -5.093 -13.794
## 5 0.8103598              surface       A       G       386   -4.008  -7.404
## 6 0.8103598              surface       A       I       386   -4.718 -11.800
##       z_stab    p_stab   z_destab   p_destab         class orthosteric
## 1  3.3466602 0.9995910  0.8338590 0.20218022     no change       FALSE
## 2  0.7921010 0.7858491 -1.7358159 0.95870181     no change       FALSE
## 3 -0.6128781 0.2699785 -1.7638403 0.96112056     no change       FALSE
## 4  4.1890228 0.9999860  2.1030834 0.01772924 destabilizing       FALSE
## 5  2.6778755 0.9962955  0.3540300 0.36165821     no change       FALSE
## 6  2.1183358 0.9829267 -0.4417253 0.67065601     no change       FALSE
##       fitted  residuals
## 1 -10.329862 -0.6611377
## 2  -8.317484 -3.2675161
## 3  -8.064120 -2.8498800
## 4 -11.402020 -2.3919800
## 5  -9.986755  2.5827550
## 6  -9.214440 -2.5855602
median_residuals <- pdz3_final_df_ddg_2plot_all %>%
  dplyr::group_by(Pos_ref.x) %>%
  summarise(median_residuals = median(residuals, na.rm = TRUE))

min(median_residuals$median_residuals) #-7.508073
## [1] -7.508073
max(median_residuals$median_residuals) #7.583395
## [1] 7.583395
#color byattribute a:bfactor #10 & sel target csab palette -8,red:0,white:8,blue

library(bio3d)
pdb <- read.pdb("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/domains/1be9.pdb")
data <- median_residuals
head(data)
## # A tibble: 6 × 2
##   Pos_ref.x median_residuals
##       <dbl>            <dbl>
## 1       311           -1.24 
## 2       312            0.247
## 3       313            0.659
## 4       314            1.81 
## 5       315            4.73 
## 6       316            2.61
# Create a new B-factor vector initialized with the current B-factors from the PDB
new_b_factors <- pdb$atom$b

# Loop through each position in the correlation data and update the B-factors
for (i in 1:nrow(data)) {
  position <- data$Pos_ref.x[i]
  correlation_value <- data$median_residuals[i]

  # Find indices in the PDB that match the current position
  indices <- which(pdb$atom$resno == position)

  # Print the indices and current B-factors before updating
  #cat("Updating residue number:", position, "\n")
  #cat("Indices in PDB:", indices, "\n")
  #cat("Current B-factors:", new_b_factors[indices], "\n")

  # Update B-factors for all atoms in the current residue
  new_b_factors[indices] <- correlation_value

  # Print the new B-factors after updating
  #cat("Updated B-factors:", new_b_factors[indices], "\n")
  #cat("\n")  # Add an extra line for readability
}
# Replace non-matching B-factors with outlier value so we can filter it out in ChimeraX
non_matching_indices <- setdiff(seq_along(new_b_factors), which(pdb$atom$resno %in% data$Pos_ref.x))
non_matching_indices
##   [1]    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15
##  [16]   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30
##  [31]   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45
##  [46]   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60
##  [61]   61   62   63   64   65   66   67   68   69   70   71   72   73   74   75
##  [76]   76   77   78  695  696  697  698  699  700  701  702  703  704  705  706
##  [91]  707  708  709  710  711  712  713  714  715  716  717  718  719  720  721
## [106]  722  723  724  725  726  727  728  729  730  731  732  733  734  735  736
## [121]  737  738  739  740  741  742  743  744  745  746  747  748  749  750  751
## [136]  752  753  754  755  756  757  758  759  760  761  762  763  764  765  766
## [151]  767  768  769  770  771  772  773  774  775  776  777  778  779  780  781
## [166]  782  783  784  785  786  787  788  789  790  791  792  793  794  795  796
## [181]  797  798  799  800  801  802  803  804  805  806  807  808  809  810  811
## [196]  812  813  814  815  816  817  818  819  820  821  822  823  824  825  826
## [211]  827  828  829  830  831  832  833  834  835  836  837  838  839  840  841
## [226]  842  843  844  845  846  847  848  849  850  851  852  853  854  855  856
## [241]  857  858  859  860  861  862  863  864  865  866  867  868  869  870  871
## [256]  872  873  874  875  876  877  878  879  880  881  882  883  884  885  886
## [271]  887  888  889  890  891  892  893  894  895  896  897  898  899  900  901
## [286]  902  903  904  905  906  907  908  909  910  911  912  913  914  915  916
## [301]  917  918  919  920  921  922  923  924  925  926  927  928  929  930  931
## [316]  932  933  934  935  936  937  938  939  940  941  942  943  944  945  946
## [331]  947  948  949  950  951  952  953  954  955  956  957  958  959  960  961
## [346]  962  963  964  965  966  967  968  969  970  971  972  973  974  975  976
## [361]  977  978  979  980  981  982  983  984  985  986  987  988  989  990  991
## [376]  992  993  994  995  996  997  998  999 1000 1001 1002 1003 1004 1005 1006
## [391] 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021
## [406] 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036
## [421] 1037 1038 1039 1040 1041 1042 1043 1044 1045
new_b_factors[non_matching_indices] <- 999

# Assign the new B-factors back to the pdb structure
# Write the modified PDB structure to a new file
pdb$atom$b <- new_b_factors
write.pdb(pdb, file = "/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/domains/1be9_median_residual_loess.pdb")
library(ggExtra)
head(pdz3_final_df_ddg_2plot)
##   id_eve f_ddg_pred f_ddg_pred_sd   ESM1v  b_ddg_pred b_ddg_pred_sd orthosteric
## 1  A386C  0.6659222     0.1989811 -10.991  0.11921857    0.06126311       FALSE
## 2  A386D  0.1566707     0.1977913 -11.585 -0.02530033    0.82292100       FALSE
## 3  A386E -0.2662460     0.4344191 -10.914  0.04624840    0.06722154       FALSE
## 4  A386F  1.0041094     0.2397002 -13.794  0.27775264    0.16555277       FALSE
## 5  A386G  0.5761733     0.2151606  -7.404  0.06452813    0.07905913       FALSE
## 6  A386I  0.4137276     0.1953078 -11.800  0.07812631    0.49614970       FALSE
##           class allosteric
## 1     no change      FALSE
## 2     no change      FALSE
## 3     no change      FALSE
## 4 destabilizing      FALSE
## 5     no change      FALSE
## 6     no change      FALSE
pdz3_final_df_ddg_2plot_out <- pdz3_final_df_ddg_2plot %>% filter(orthosteric == FALSE)

# Fit a loess model using the filtered data
loess_fit <- loess(ESM1v ~ f_ddg_pred, data = pdz3_final_df_ddg_2plot_out, span = 0.7, family = "symmetric")

# Predict fitted values for ALL data points using the loess model 
pdz3_final_df_ddg_2plot$fitted <- predict(loess_fit, newdata = pdz3_final_df_ddg_2plot)

# Calculate residuals for ALL points
pdz3_final_df_ddg_2plot$residuals <- pdz3_final_df_ddg_2plot$ESM1v - pdz3_final_df_ddg_2plot$fitted

sum(is.na(pdz3_final_df_ddg_2plot$residuals))
## [1] 1
sum(is.na(pdz3_final_df_ddg_2plot$fitted))
## [1] 1
range(pdz3_final_df_ddg_2plot$residuals, na.rm = TRUE) #-11.29141  14.24574
## [1] -11.29141  14.24574
range(pdz3_final_df_ddg_2plot$ESM1v) #-20.381   6.127
## [1] -20.381   6.127
range(pdz3_final_df_ddg_2plot$f_ddg_pred) #-1.754627  3.682299
## [1] -1.754627  3.682299
fit_line_df <- data.frame(
  f_ddg_pred = seq(min(0, na.rm = TRUE),
                            max(pdz3_final_df_ddg_2plot_out$f_ddg_pred, na.rm = TRUE),
                            length.out = 200)
)

fit_line_df$ESM1v <- predict(loess_fit, newdata = fit_line_df)

cor.test(pdz3_final_df_ddg_2plot$f_ddg_pred, pdz3_final_df_ddg_2plot$ESM1v, method = "spearman") #-0.4588265 
## Warning in cor.test.default(pdz3_final_df_ddg_2plot$f_ddg_pred,
## pdz3_final_df_ddg_2plot$ESM1v, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  pdz3_final_df_ddg_2plot$f_ddg_pred and pdz3_final_df_ddg_2plot$ESM1v
## S = 935533202, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.4588265
nrow(pdz3_final_df_ddg_2plot)
## [1] 1567
p1 <- ggplot(pdz3_final_df_ddg_2plot, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  #geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
            inherit.aes = FALSE, color = "black", linewidth = 0.8) +
  labs(
    title = "PSD-95-PDZ3: All 1,567 Mutations",
    subtitle = "Spearman's rho = -0.46",
    x = "Measured ddGf",
    y = "ESM1v pathogenicity",
    color = "Residuals"
  ) +
  theme_classic() +
  ylim(-20.4, 6.2) + xlim(-1.8, 3.7) +
  scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
                        limits = c(-11.3, 15)) +
  theme(legend.position = "none")

p1

pdz3_final_df_ddg_2plot_int <- pdz3_final_df_ddg_2plot %>% filter(orthosteric == TRUE)
nrow(pdz3_final_df_ddg_2plot_int)
## [1] 242
cor.test(pdz3_final_df_ddg_2plot_int$f_ddg_pred, pdz3_final_df_ddg_2plot_int$ESM1v, method = "spearman") #-0.31
## Warning in cor.test.default(pdz3_final_df_ddg_2plot_int$f_ddg_pred,
## pdz3_final_df_ddg_2plot_int$ESM1v, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  pdz3_final_df_ddg_2plot_int$f_ddg_pred and pdz3_final_df_ddg_2plot_int$ESM1v
## S = 3089860, p-value = 1.018e-06
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.3081314
p2 <- ggplot(pdz3_final_df_ddg_2plot_int, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
            inherit.aes = FALSE, color = "black", linewidth = 0.5) +
  #geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
  #          inherit.aes = FALSE, color = "black", linewidth = 0.8) +
  labs(
    title = "242 Orthosteric Mutations",
    subtitle = "Spearman's rho = -0.31",
    x = "Measured ddGf",
    y = "",
    color = "Residuals"
  ) +
  theme_classic() +
  ylim(-20.4, 6.2) + xlim(-1.8, 3.7) +
  scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
                         limits = c(-11.3, 15)) +
  theme(legend.position = "none")

p2

pdz3_final_df_ddg_2plot_allo <- pdz3_final_df_ddg_2plot %>% filter(allosteric == TRUE)
nrow(pdz3_final_df_ddg_2plot_allo)
## [1] 365
cor.test(pdz3_final_df_ddg_2plot_allo$f_ddg_pred, pdz3_final_df_ddg_2plot_allo$ESM1v, method = "spearman") #-0.26
## Warning in cor.test.default(pdz3_final_df_ddg_2plot_allo$f_ddg_pred,
## pdz3_final_df_ddg_2plot_allo$ESM1v, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  pdz3_final_df_ddg_2plot_allo$f_ddg_pred and pdz3_final_df_ddg_2plot_allo$ESM1v
## S = 10187427, p-value = 6.449e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.2570149
p3 <- ggplot(pdz3_final_df_ddg_2plot_allo, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
            inherit.aes = FALSE, color = "black", linewidth = 0.5) +
  #geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
  #          inherit.aes = FALSE, color = "black", linewidth = 0.8) +
  labs(
    title = "365 Allosteric Mutations",
    subtitle = "Spearman's rho = -0.26",
    x = "Measured ddGf",
    y = "",
    color = "Residuals"
  ) +
  theme_classic() +
  ylim(-20.4, 6.2) + xlim(-1.8, 3.7) +
  scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
                       limits = c(-11.3, 15)) +
  theme(legend.position = "none")

p3

pdz3_final_df_ddg_2plot_other <- pdz3_final_df_ddg_2plot %>% filter(allosteric == FALSE) %>% filter(orthosteric == FALSE)
nrow(pdz3_final_df_ddg_2plot_other)
## [1] 960
cor.test(pdz3_final_df_ddg_2plot_other$f_ddg_pred, pdz3_final_df_ddg_2plot_other$ESM1v, method = "spearman") #-0.36
## Warning in cor.test.default(pdz3_final_df_ddg_2plot_other$f_ddg_pred,
## pdz3_final_df_ddg_2plot_other$ESM1v, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  pdz3_final_df_ddg_2plot_other$f_ddg_pred and pdz3_final_df_ddg_2plot_other$ESM1v
## S = 200844875, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## -0.362068
p4 <- ggplot(pdz3_final_df_ddg_2plot_other, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
            inherit.aes = FALSE, color = "black", linewidth = 0.5) +
  #geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
  #          inherit.aes = FALSE, color = "black", linewidth = 0.8) +
  labs(
    title = "960 Other Mutations",
    subtitle = "Spearman's rho = -0.36",
    x = "Measured ddGf",
    y = "",
    color = "Residuals"
  ) +
  theme_classic() +
  ylim(-20.4, 6.2) + xlim(-1.8, 3.7) +
  scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
                         limits = c(-11.3, 15)) +
  theme(legend.position = "none")

p4

fig3C <- plot_grid(p1,p2,p3, p4, ncol = 4, nrow=1)

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_scatter.pdf", 
       plot = fig3C, width = 12, height = 3, dpi = 300)
fig3C

pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>%
  mutate(pos_eve = as.numeric(str_extract(id_eve, "(?<=\\D)(\\d+)(?=\\D)")))


pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>%
  mutate(site_class = case_when(
    orthosteric == TRUE ~ "orthosteric",
    allosteric == TRUE ~ "allosteric",
    TRUE ~ "other"
  ))

label_df <- pdz3_final_df_ddg_2plot %>%
  group_by(site_class) %>%
  summarise(
    n = n(),
    median_val = median(residuals, na.rm = TRUE),
    .groups = "drop"
  )

pdz3_final_df_ddg_2plot$site_class <- factor(pdz3_final_df_ddg_2plot$site_class, levels = c("orthosteric", "allosteric", "other"))

fig3C <- ggplot(pdz3_final_df_ddg_2plot, aes(x = site_class, y = residuals, fill = site_class)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA)+
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
    geom_text(
    data = label_df,
    aes(x = site_class, y = max(pdz3_final_df_ddg_2plot$residuals) * 1.1,
        label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  geom_text(
    data = label_df,
    aes(x = site_class, y = 10,
        label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
    geom_text(
  data = label_df,
  aes(x = site_class, y = median_val + 1.9, label = sprintf("%.2f", median_val)),
  inherit.aes = FALSE,
  size = 6
) +
    geom_signif(
    comparisons = list(
      c("orthosteric", "allosteric"),
      c("orthosteric", "other"),
      c("allosteric", "other")
    ),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1,
    tip_length = 0.01
  ) +
  
  # Labels and theme
  labs(
    title = "PSD-95-PDZ3",
    subtitle = "",
    x = "",
    y = "ESM1v - ddGf residual"
  ) +
  scale_fill_manual(values = c(
    "orthosteric" = "orange",
    "allosteric" = "cyan3",
    "other" = "darkgreen"
  )) +
  theme_classic() +
  theme(legend.position = "none")

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_violin.pdf", 
       plot = fig3C, width = 3, height = 3, dpi = 300)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_signif()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_text()`).
fig3C
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_signif()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_text()`).

legend <-  ggplot(pdz3_final_df_ddg_2plot_other, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  #geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
            inherit.aes = FALSE, color = "black", linewidth = 0.8) +
  labs(
    title = "1,243 Other Mutations",
    subtitle = "Spearman's rho = -0.40",
    x = "Measured ddGf",
    y = "ESM1v",
    color = "Residuals"
  ) +
  theme_classic() +
  ylim(-20.4, 6.2) + xlim(-1.8, 3.7) +
  scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
                        limits = c(-11.3, 15)) +
  theme(legend.position = "top")
legend

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_legend.pdf", 
       plot = legend, width = 5, height = 4, dpi = 300)
bootstrap_r_squared <- function(input_data, fit, n_bootstrap = 1000) {
  marginal_r_squared_values <- numeric(n_bootstrap)
  conditional_r_squared_values <- numeric(n_bootstrap)
  
  for (k in 1:n_bootstrap) {
    indices <- sample(1:nrow(input_data), size = nrow(input_data), replace = TRUE)
    bootstrap_data <- input_data[indices, ]
    
    bootstrap_fit <- tryCatch({
      update(fit, data = bootstrap_data)
    }, error = function(e) {
      NULL
    })
    
    if (!is.null(bootstrap_fit)) {
      r2_values <- r2(bootstrap_fit)
      marginal_r_squared_values[k] <- r2_values$R2_marginal
      conditional_r_squared_values[k] <- r2_values$R2_conditional
    }
  }
  
  marginal_r_squared_sorted <- sort(marginal_r_squared_values, na.last = NA)
  conditional_r_squared_sorted <- sort(conditional_r_squared_values, na.last = NA)
  
  list(
    marginal_r_squared_ci = c(
      lower = marginal_r_squared_sorted[round(n_bootstrap * 0.025)],
      upper = marginal_r_squared_sorted[round(n_bootstrap * 0.975)]
    ),
    conditional_r_squared_ci = c(
      lower = conditional_r_squared_sorted[round(n_bootstrap * 0.025)],
      upper = conditional_r_squared_sorted[round(n_bootstrap * 0.975)]
    ),
    marginal_r_squared_values = marginal_r_squared_values,
    conditional_r_squared_values = conditional_r_squared_values
  )
}

fit_model_and_diagnostics_both_lmm <- function(input_data, model_id) {
  fit <- lmer(ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve), data = input_data)
  #control = lmerControl(check.conv.singular = "ignore"))
  
  input_data[[paste0("predicted_", model_id)]] <- predict(fit, newdata = input_data, re.form = NULL)
  
  marginal_r_squared <- r2(fit)$R2_marginal
  conditional_r_squared <- r2(fit)$R2_conditional
  
  bootstrap_r2 <- bootstrap_r_squared(input_data, fit, n_bootstrap = 1000)
  
  print(paste("Marginal R-squared:", round(marginal_r_squared, 4)))
  print(paste("Conditional R-squared:", round(conditional_r_squared, 4)))
  
  return(list(
    fit = fit,
    marginal_r_squared = bootstrap_r2$marginal_r_squared_values,
    conditional_r_squared = bootstrap_r2$conditional_r_squared_values,
    marginal_r_squared_ci = c(bootstrap_r2$marginal_r_squared_ci["lower"], 
                              bootstrap_r2$marginal_r_squared_ci["upper"]),
    conditional_squared_ci = c(bootstrap_r2$conditional_r_squared_ci["lower"], 
                               bootstrap_r2$conditional_r_squared_ci["upper"]),
    input_data = input_data
  ))
}


fit_model_and_diagnostics_ddgf_lmm <- function(input_data, model_id) {
  fit <- lmer(ESM1v ~  f_ddg_pred  + (1 | pos_eve), data = input_data)
  
  input_data[[paste0("predicted_", model_id)]] <- predict(fit, newdata = input_data, re.form = NULL)
  
  marginal_r_squared <- r2(fit)$R2_marginal
  conditional_r_squared <- r2(fit)$R2_conditional
  
  bootstrap_r2 <- bootstrap_r_squared(input_data, fit, n_bootstrap = 1000)
  
  print(paste("Marginal R-squared:", round(marginal_r_squared, 4)))
  print(paste("Conditional R-squared:", round(conditional_r_squared, 4)))
  
  return(list(
    fit = fit,
    marginal_r_squared = bootstrap_r2$marginal_r_squared_values,
    conditional_r_squared = bootstrap_r2$conditional_r_squared_values,
    marginal_r_squared_ci = c(bootstrap_r2$marginal_r_squared_ci["lower"], 
                              bootstrap_r2$marginal_r_squared_ci["upper"]),
    conditional_squared_ci = c(bootstrap_r2$conditional_r_squared_ci["lower"], 
                               bootstrap_r2$conditional_r_squared_ci["upper"]),
    input_data = input_data
  ))
}
library(modeest)
library(Metrics)
library(broom)
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(performance)
## 
## Attaching package: 'performance'
## The following objects are masked from 'package:Metrics':
## 
##     mae, mse, rmse
m0_results <- fit_model_and_diagnostics_both_lmm(input_data = pdz3_final_df_ddg_2plot, model_id = "m0")
## [1] "Marginal R-squared: 0.3104"
## [1] "Conditional R-squared: 0.676"
# [1] "Marginal R-squared: 0.3104"
# [1] "Conditional R-squared: 0.676"
m1_results <- fit_model_and_diagnostics_ddgf_lmm(input_data = pdz3_final_df_ddg_2plot, model_id = "m1")
## [1] "Marginal R-squared: 0.2031"
## [1] "Conditional R-squared: 0.6399"
# [1] "Marginal R-squared: 0.2031"
# [1] "Conditional R-squared: 0.6399"
pdz3_final_df_ddg_2plot_non_active <- pdz3_final_df_ddg_2plot_non_active %>%
  mutate(pos_eve = as.numeric(str_extract(id_eve, "(?<=\\D)(\\d+)(?=\\D)")))

m3_results <- fit_model_and_diagnostics_both_lmm(input_data = pdz3_final_df_ddg_2plot_non_active, model_id = "m3")
## [1] "Marginal R-squared: 0.3545"
## [1] "Conditional R-squared: 0.6627"
# [1] "Marginal R-squared: 0.3545"
# [1] "Conditional R-squared: 0.6627"
m4_results <- fit_model_and_diagnostics_ddgf_lmm(input_data = pdz3_final_df_ddg_2plot_non_active, model_id = "m4")
## [1] "Marginal R-squared: 0.2441"
## [1] "Conditional R-squared: 0.6211"
# [1] "Marginal R-squared: 0.2441"
# [1] "Conditional R-squared: 0.6211"
pdz3_final_df_ddg_2plot_active <- pdz3_final_df_ddg_2plot_active %>%
  mutate(pos_eve = as.numeric(str_extract(id_eve, "(?<=\\D)(\\d+)(?=\\D)")))

m6_results <- fit_model_and_diagnostics_both_lmm(input_data = pdz3_final_df_ddg_2plot_active, model_id = "m6")
## [1] "Marginal R-squared: 0.266"
## [1] "Conditional R-squared: 0.3795"
# [1] "Marginal R-squared: 0.266"
# [1] "Conditional R-squared: 0.3795"
m7_results <- fit_model_and_diagnostics_ddgf_lmm(input_data = pdz3_final_df_ddg_2plot_active, model_id = "m7")
## [1] "Marginal R-squared: 0.1598"
## [1] "Conditional R-squared: 0.2618"
#[1] "Marginal R-squared: 0.1598"
#[1] "Conditional R-squared: 0.2618"
anova(m1_results$fit, m0_results$fit)
## refitting model(s) with ML (instead of REML)
## Data: input_data
## Models:
## m1_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
## m0_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
##                npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)    
## m1_results$fit    4 7575.2 7596.6 -3783.6    7567.2                         
## m0_results$fit    5 7409.7 7436.5 -3699.9    7399.7 167.48  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# m1_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
# m0_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
#                npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)    
# m1_results$fit    4 7575.2 7596.6 -3783.6    7567.2                         
# m0_results$fit    5 7409.7 7436.5 -3699.9    7399.7 167.48  1  < 2.2e-16 ***

anova(m4_results$fit, m3_results$fit)
## refitting model(s) with ML (instead of REML)
## Data: input_data
## Models:
## m4_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
## m3_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
##                npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)    
## m4_results$fit    4 6354.3 6375.1 -3173.2    6346.3                         
## m3_results$fit    5 6215.8 6241.7 -3102.9    6205.8 140.57  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# m4_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
# m3_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
#                npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)    
# m4_results$fit    4 6354.3 6375.1 -3173.2    6346.3                         
# m3_results$fit    5 6215.8 6241.7 -3102.9    6205.8 140.57  1  < 2.2e-16 ***

anova(m7_results$fit, m6_results$fit)
## refitting model(s) with ML (instead of REML)
## Data: input_data
## Models:
## m7_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
## m6_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
##                npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)    
## m7_results$fit    4 1175.3 1189.2 -583.64    1167.3                         
## m6_results$fit    5 1155.6 1173.0 -572.78    1145.6 21.705  1  3.179e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# m7_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
# m6_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
#                npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)    
# m7_results$fit    4 1175.3 1189.2 -583.64    1167.3                         
# m6_results$fit    5 1155.6 1173.0 -572.78    1145.6 21.705  1  3.179e-06 ***
merged_df <- merge(pdz3_final_df_ddg_2plot_all, pdz3_out, by.x="Pos_ref.x", by.y = "Pos")
nrow(merged_df) #1587
## [1] 1587
merged_df_pos <- merged_df %>% filter(b_ddg_pred >= 0)
nrow(merged_df_pos)
## [1] 1335
merged_df_residue <- merged_df_pos %>%
  group_by(Pos_ref.x) %>%
  reframe(
    median_ddgb = median(b_ddg_pred, na.rm = TRUE),
    median_residual = median(residuals, na.rm = TRUE),
    scHAmin_ligand = first(scHAmin_ligand),
    orthosteric = first(orthosteric),
    Pos = first(Pos_ref.x))

nrow(merged_df_residue) #84
## [1] 84
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model_pdz3_ddgb <- nls(abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand),
                 data = merged_df_residue,
                 start = list(a = 1, b = 0.1))
exp_model_pdz3_ddgb
## Nonlinear regression model
##   model: abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand)
##    data: merged_df_residue
##       a       b 
## 1.40410 0.07758 
##  residual sum-of-squares: 12.64
## 
## Number of iterations to convergence: 5 
## Achieved convergence tolerance: 7.005e-06
# Nonlinear regression model
#   model: abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand)
#    data: merged_df_residue
#       a       b 
# 1.40410 0.07758 
#  residual sum-of-squares: 12.64
# 
# Number of iterations to convergence: 5 
# Achieved convergence tolerance: 7.005e-06

a <- 1.40410
b <-  0.07758 
half_d <- log(2) / b  # ≈ 8.934612
half_d
## [1] 8.934612
y_cutoff <- mean(merged_df %>% filter(orthosteric == TRUE) %>% pull(b_ddg_pred), na.rm=TRUE)

merged_df_residue$site_type <- "Non-orthosteric site"
merged_df_residue$site_type[abs(merged_df_residue$median_ddgb) <= y_cutoff] <- "Null"
merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"


x_vals <- seq(min(merged_df_residue$scHAmin_ligand, na.rm = TRUE),
              max(merged_df_residue$scHAmin_ligand, na.rm = TRUE), length.out = 200)

# --- Bootstrapping for confidence intervals ---
set.seed(11)
boot_params <- replicate(1000, {
  samp <- merged_df_residue[sample(nrow(merged_df_residue), replace = TRUE), ]
  fit <- try(nlsLM(abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand),
                   data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
  if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
})
boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]

boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))

fit_df_residue <- data.frame(
  scHAmin_ligand = x_vals,
  median_ddgb = predict(exp_model_pdz3_ddgb, newdata = data.frame(scHAmin_ligand = x_vals)),
  lower = apply(boot_preds, 1, quantile, probs = 0.025),
  upper = apply(boot_preds, 1, quantile, probs = 0.975)
)

# Plot
p1 <- ggplot(merged_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb))) +
  # Base layer: all points
  geom_point(aes(color = site_type, alpha = 0.4), size = 2) +
   # Fit line with confidence ribbon
  geom_ribbon(data = fit_df_residue,
            aes(x = scHAmin_ligand, ymin = lower, ymax = upper),
            fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
  geom_line(data = fit_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb)),
          inherit.aes = FALSE, color = "black", size = 1) +
  geom_text_repel(data = merged_df_residue %>% filter(site_type != "Null"), aes(label=Pos))+
  
  # Manual color palette for site type
  scale_color_manual(values = c(
    "Null" = "darkgreen",
    "Non-orthosteric site" = "darkgreen",
    "Binding site" = "orange"
  )) +
  # Reference lines
  geom_vline(xintercept = 5, linetype = "dashed", color = "slategrey") +
  geom_hline(yintercept = y_cutoff, linetype = "dashed", color = "slategrey") +
  theme_classic() +
  labs(
    title = "PSD-95-PDZ3: allosteric decay",
    subtitle = "1335 mutations with ddGb >=0",
    x = "Minimal distance to ligand",
    y = "Median ddGb",
    color = ""
  )  + theme(legend.position = "bottom") +
    annotate("text",  x = Inf, y = Inf,
             hjust = 1, vjust = 1,
           label = "y = a * exp (b * x)\na = 1.40410  \nb = -0.07758",
           size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Duplicated aesthetics after name standardisation: hjust
p1 <- ggMarginal(
  p1,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_ddgb_exp.pdf", 
       plot = p1, width = 4, height = 4, dpi = 300)
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p1

lm_model <- lm(log(abs(median_ddgb)) ~ scHAmin_ligand, data = merged_df_residue)
summary(lm_model)
## 
## Call:
## lm(formula = log(abs(median_ddgb)) ~ scHAmin_ligand, data = merged_df_residue)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.89470 -0.50236  0.09722  0.49863  1.26543 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.35171    0.19914   1.766   0.0811 .  
## scHAmin_ligand -0.10112    0.01821  -5.554 3.37e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6744 on 82 degrees of freedom
## Multiple R-squared:  0.2733, Adjusted R-squared:  0.2645 
## F-statistic: 30.84 on 1 and 82 DF,  p-value: 3.37e-07
# Call:
# lm(formula = log(abs(median_ddgb)) ~ scHAmin_ligand, data = merged_df_residue)
# 
# Residuals:
#      Min       1Q   Median       3Q      Max 
# -1.89470 -0.50236  0.09722  0.49863  1.26543 
# 
# Coefficients:
#                Estimate Std. Error t value Pr(>|t|)    
# (Intercept)     0.35171    0.19914   1.766   0.0811 .  
# scHAmin_ligand -0.10112    0.01821  -5.554 3.37e-07 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 0.6744 on 82 degrees of freedom
# Multiple R-squared:  0.2733,  Adjusted R-squared:  0.2645 
# F-statistic: 30.84 on 1 and 82 DF,  p-value: 3.37e-07
merged_df <- merge(pdz3_final_df_ddg_2plot_all, pdz3_out, by.x="Pos_ref.x", by.y = "Pos")
nrow(merged_df) #1587
## [1] 1587
merged_df_neg <- merged_df %>% filter(residuals <= 0)
nrow(merged_df_neg)
## [1] 898
merged_df_residue <- merged_df_neg %>%
  group_by(Pos_ref.x) %>%
  reframe(
    median_ddgb = median(b_ddg_pred, na.rm = TRUE),
    median_residual = median(residuals, na.rm = TRUE),
    scHAmin_ligand = first(scHAmin_ligand),
    orthosteric = first(orthosteric),
    Pos = first(Pos_ref.x))

nrow(merged_df_residue) #82
## [1] 82
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model_pdz3_res <- nls(abs(median_residual) ~ a * exp(-b * scHAmin_ligand),
                 data = merged_df_residue,
                 start = list(a = 1, b = 0.1))
exp_model_pdz3_res
## Nonlinear regression model
##   model: abs(median_residual) ~ a * exp(-b * scHAmin_ligand)
##    data: merged_df_residue
##      a      b 
## 7.3547 0.1291 
##  residual sum-of-squares: 121
## 
## Number of iterations to convergence: 6 
## Achieved convergence tolerance: 8.824e-07
# Nonlinear regression model
#   model: abs(median_residual) ~ a * exp(-b * scHAmin_ligand)
#    data: merged_df_residue
#      a      b 
# 7.3547 0.1291 
#  residual sum-of-squares: 121
# 
# Number of iterations to convergence: 6 
# Achieved convergence tolerance: 8.824e-07

a <- 7.3547
b <-  0.1291
half_d <- log(2) / b  # ≈ 5.369072
half_d
## [1] 5.369072
y_cutoff <- mean(merged_df %>% filter(orthosteric == TRUE) %>% pull(residuals), na.rm = TRUE)

merged_df_residue$site_type <- "Non-orthosteric site"
merged_df_residue$site_type[abs(merged_df_residue$median_residual) <= abs(y_cutoff)] <- "Null"
merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"
table(merged_df_residue$site_type)
## 
##         Binding site Non-orthosteric site                 Null 
##                   13                    8                   61
x_vals <- seq(min(merged_df_residue$scHAmin_ligand, na.rm = TRUE),
              max(merged_df_residue$scHAmin_ligand, na.rm = TRUE), length.out = 200)

# --- Bootstrapping for confidence intervals ---
set.seed(11)
boot_params <- replicate(1000, {
  samp <- merged_df_residue[sample(nrow(merged_df_residue), replace = TRUE), ]
  fit <- try(nlsLM(abs(median_residual) ~ a * exp(-b * scHAmin_ligand),
                   data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
  if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
})
boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]

boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))

fit_df_residue <- data.frame(
  scHAmin_ligand = x_vals,
  median_ddgb = predict(exp_model_pdz3_res, newdata = data.frame(scHAmin_ligand = x_vals)),
  lower = apply(boot_preds, 1, quantile, probs = 0.025),
  upper = apply(boot_preds, 1, quantile, probs = 0.975)
)
# Plot
p2 <- ggplot(merged_df_residue, aes(x = scHAmin_ligand, y = abs(median_residual))) +
  # Base layer: all points
  geom_point(aes(color = site_type, alpha = 0.4), size = 2) +
  geom_ribbon(data = fit_df_residue,
            aes(x = scHAmin_ligand, ymin = lower, ymax = upper),
            fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
  geom_line(data = fit_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb)),
          inherit.aes = FALSE, color = "black", size = 1) +
  geom_text_repel(data = merged_df_residue %>% filter(site_type != "Null"), aes(label=Pos))+
  
  # Manual color palette for site type
  scale_color_manual(values = c(
    "Null" = "darkgreen",
    "Non-orthosteric site" = "darkgreen",
    "Binding site" = "orange"
  )) +
  # Reference lines
  geom_vline(xintercept = 5, linetype = "dashed", color = "slategrey") +
  geom_hline(yintercept = abs(y_cutoff), linetype = "dashed", color = "slategrey") +
  theme_classic() +
  labs(
    title = "PSD-95-PDZ3: allosteric decay",
    subtitle = "898 mutations with residuals <= 0",
    x = "Minimal distance to ligand",
    y = "|Median ESM1v-ddGf residual|",
    color = ""
  )  + theme(legend.position = "bottom") +
    annotate("text",  x = Inf, y = Inf,
             hjust = 1, vjust = 1,
           label = "y = a * exp (b * x)\na = 7.3547  \nb = -0.1291",
           size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
## Warning: Duplicated aesthetics after name standardisation: hjust
p2 <- ggMarginal(
  p2,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_residual_decay.pdf", 
       plot = p2, width = 4, height = 4, dpi = 300)
p2

lm_model <- lm(log(abs(median_residual)) ~ scHAmin_ligand, data = merged_df_residue)
summary(lm_model)
## 
## Call:
## lm(formula = log(abs(median_residual)) ~ scHAmin_ligand, data = merged_df_residue)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6808 -0.4011  0.2040  0.5331  1.4222 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.87648    0.27421   6.843 1.42e-09 ***
## scHAmin_ligand -0.14203    0.02518  -5.642 2.46e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9201 on 80 degrees of freedom
## Multiple R-squared:  0.2846, Adjusted R-squared:  0.2757 
## F-statistic: 31.83 on 1 and 80 DF,  p-value: 2.461e-07
# Call:
# lm(formula = log(abs(median_residual)) ~ scHAmin_ligand, data = merged_df_residue)
# 
# Residuals:
#     Min      1Q  Median      3Q     Max 
# -3.6808 -0.4011  0.2040  0.5331  1.4222 
# 
# Coefficients:
#                Estimate Std. Error t value Pr(>|t|)    
# (Intercept)     1.87648    0.27421   6.843 1.42e-09 ***
# scHAmin_ligand -0.14203    0.02518  -5.642 2.46e-07 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 0.9201 on 80 degrees of freedom
# Multiple R-squared:  0.2846,  Adjusted R-squared:  0.2757 
# F-statistic: 31.83 on 1 and 80 DF,  p-value: 2.461e-07
nrow(merged_df) #1587
## [1] 1587
cor.test(merged_df$residuals, merged_df$b_ddg_pred) #-0.2958665
## 
##  Pearson's product-moment correlation
## 
## data:  merged_df$residuals and merged_df$b_ddg_pred
## t = -12.249, df = 1564, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3404126 -0.2499953
## sample estimates:
##        cor 
## -0.2958665
head(merged_df)
##   Pos_ref.x id_eve X.x V1.x id_old.x pos_am.x trait_name.x  f_ddg_pred
## 1       311  P354A   1    2      P1A        1      Folding -0.11463383
## 2       311  P354C   2    3      P1C        1      Folding  0.05651265
## 3       311  P354D   3    4      P1D        1      Folding  0.26523510
## 4       311  P354E   4    5      P1E        1      Folding -0.48300818
## 5       311  P354F   5    6      P1F        1      Folding  0.26560825
## 6       311  P354G   6    7      P1G        1      Folding  0.25143468
##   f_ddg_pred_sd ci95_kcal.mol.x     library.x assay.x pdz.x pdz_name.x
## 1    0.17292605       0.6778701 761_abundance folding   761  DLG4_PDZ3
## 2    0.07504865       0.2941907 761_abundance folding   761  DLG4_PDZ3
## 3    0.17022757       0.6672921 761_abundance folding   761  DLG4_PDZ3
## 4    0.23419525       0.9180454 761_abundance folding   761  DLG4_PDZ3
## 5    0.30217642       1.1845316 761_abundance folding   761  DLG4_PDZ3
## 6    0.04204546       0.1648182 761_abundance folding   761  DLG4_PDZ3
##   alignment_position.x WT_aa.x structural_alignment_pos.x   consv.x
## 1                  V11       P                         11 0.0607867
## 2                  V11       P                         11 0.0607867
## 3                  V11       P                         11 0.0607867
## 4                  V11       P                         11 0.0607867
## 5                  V11       P                         11 0.0607867
## 6                  V11       P                         11 0.0607867
##   HAmin_ligand.x scHAmin_ligand.x     X1.x     X2.x     X3.x     X4.x     X5.x
## 1       18.38841         18.38841 30.40709 21.31351 27.95525 21.34054 24.21002
## 2       18.38841         18.38841 30.40709 21.31351 27.95525 21.34054 24.21002
## 3       18.38841         18.38841 30.40709 21.31351 27.95525 21.34054 24.21002
## 4       18.38841         18.38841 30.40709 21.31351 27.95525 21.34054 24.21002
## 5       18.38841         18.38841 30.40709 21.31351 27.95525 21.34054 24.21002
## 6       18.38841         18.38841 30.40709 21.31351 27.95525 21.34054 24.21002
##       X6.x     X7.x     X8.x     X9.x closest_ligand_pos.x closest_ligand_aa.x
## 1 22.35791 22.23024 23.87194 18.38841                    9                   V
## 2 22.35791 22.23024 23.87194 18.38841                    9                   V
## 3 22.35791 22.23024 23.87194 18.38841                    9                   V
## 4 22.35791 22.23024 23.87194 18.38841                    9                   V
## 5 22.35791 22.23024 23.87194 18.38841                    9                   V
## 6 22.35791 22.23024 23.87194 18.38841                    9                   V
##   binding_interface_5A.x WT_aa.1.x secondary_structure.x   rsasa.x
## 1                  FALSE         P                  loop 0.9242239
## 2                  FALSE         P                  loop 0.9242239
## 3                  FALSE         P                  loop 0.9242239
## 4                  FALSE         P                  loop 0.9242239
## 5                  FALSE         P                  loop 0.9242239
## 6                  FALSE         P                  loop 0.9242239
##   structure_location.x wt_aa.x mt_aa.x pos_eve.x popEVE.x ESM1v.x  X.y V1.y
## 1              surface       P       A       354   -3.710  -5.226 1588 1590
## 2              surface       P       C       354   -4.395  -9.925 1589 1591
## 3              surface       P       D       354   -4.804 -12.238 1590 1592
## 4              surface       P       E       354   -4.425 -10.872 1591 1593
## 5              surface       P       F       354   -4.521 -10.748 1592 1594
## 6              surface       P       G       354   -4.307  -9.615 1593 1595
##   id_old.y pos_am.y trait_name.y b_ddg_pred b_ddg_pred_sd ci95_kcal.mol.y
## 1      P1A        1      Binding  0.1978893     0.0608342      0.23847008
## 2      P1C        1      Binding  0.4602309     0.3331480      1.30594030
## 3      P1D        1      Binding  0.2162344     0.1601899      0.62794460
## 4      P1E        1      Binding  0.1494236     0.1641081      0.64330380
## 5      P1F        1      Binding  0.3684195     0.1401020      0.54920000
## 6      P1G        1      Binding  0.3154340     0.0236291      0.09262606
##   library.y assay.y pdz.y pdz_name.y alignment_position.y WT_aa.y
## 1   761_808 binding   761  DLG4_PDZ3                  V11       P
## 2   761_808 binding   761  DLG4_PDZ3                  V11       P
## 3   761_808 binding   761  DLG4_PDZ3                  V11       P
## 4   761_808 binding   761  DLG4_PDZ3                  V11       P
## 5   761_808 binding   761  DLG4_PDZ3                  V11       P
## 6   761_808 binding   761  DLG4_PDZ3                  V11       P
##   structural_alignment_pos.y   consv.y HAmin_ligand.y scHAmin_ligand.y     X1.y
## 1                         11 0.0607867       18.38841         18.38841 30.40709
## 2                         11 0.0607867       18.38841         18.38841 30.40709
## 3                         11 0.0607867       18.38841         18.38841 30.40709
## 4                         11 0.0607867       18.38841         18.38841 30.40709
## 5                         11 0.0607867       18.38841         18.38841 30.40709
## 6                         11 0.0607867       18.38841         18.38841 30.40709
##       X2.y     X3.y     X4.y     X5.y     X6.y     X7.y     X8.y     X9.y
## 1 21.31351 27.95525 21.34054 24.21002 22.35791 22.23024 23.87194 18.38841
## 2 21.31351 27.95525 21.34054 24.21002 22.35791 22.23024 23.87194 18.38841
## 3 21.31351 27.95525 21.34054 24.21002 22.35791 22.23024 23.87194 18.38841
## 4 21.31351 27.95525 21.34054 24.21002 22.35791 22.23024 23.87194 18.38841
## 5 21.31351 27.95525 21.34054 24.21002 22.35791 22.23024 23.87194 18.38841
## 6 21.31351 27.95525 21.34054 24.21002 22.35791 22.23024 23.87194 18.38841
##   Pos_ref.y closest_ligand_pos.y closest_ligand_aa.y binding_interface_5A.y
## 1       311                    9                   V                  FALSE
## 2       311                    9                   V                  FALSE
## 3       311                    9                   V                  FALSE
## 4       311                    9                   V                  FALSE
## 5       311                    9                   V                  FALSE
## 6       311                    9                   V                  FALSE
##   WT_aa.1.y secondary_structure.y   rsasa.y structure_location.y wt_aa.y
## 1         P                  loop 0.9242239              surface       P
## 2         P                  loop 0.9242239              surface       P
## 3         P                  loop 0.9242239              surface       P
## 4         P                  loop 0.9242239              surface       P
## 5         P                  loop 0.9242239              surface       P
## 6         P                  loop 0.9242239              surface       P
##   mt_aa.y pos_eve.y popEVE.y ESM1v.y     z_stab     p_stab   z_destab  p_destab
## 1       A       354   -3.710  -5.226 -0.6629067 0.25369517 -3.5543160 0.9998105
## 2       C       354   -4.395  -9.925  0.7530135 0.77427911 -5.9093315 1.0000000
## 3       D       354   -4.804 -12.238  1.5581207 0.94039768 -1.3791238 0.9160717
## 4       E       354   -4.425 -10.872 -2.0624166 0.01958404 -4.1973874 0.9999865
## 5       F       354   -4.521 -10.748  0.8789840 0.81029503 -0.7756785 0.7810306
## 6       G       354   -4.307  -9.615  5.9800680 1.00000000 -5.9118238 1.0000000
##         class orthosteric    fitted residuals HAmin_ligand scHAmin_ligand
## 1   no change       FALSE -8.064056  2.838056     18.37533       18.60036
## 2   no change       FALSE -8.167056 -1.757944     18.37533       18.60036
## 3   no change       FALSE -8.647162 -3.590838     18.37533       18.60036
## 4 stabilizing       FALSE -8.181711 -2.690289     18.37533       18.60036
## 5   no change       FALSE -8.648514 -2.099486     18.37533       18.60036
## 6   no change       FALSE -8.597645 -1.017355     18.37533       18.60036
##          5        6        7        8        9
## 1 24.60584 22.42312 22.65934 24.04283 18.60036
## 2 24.60584 22.42312 22.65934 24.04283 18.60036
## 3 24.60584 22.42312 22.65934 24.04283 18.60036
## 4 24.60584 22.42312 22.65934 24.04283 18.60036
## 5 24.60584 22.42312 22.65934 24.04283 18.60036
## 6 24.60584 22.42312 22.65934 24.04283 18.60036
merged_df_residue <- merged_df %>%
  group_by(Pos_ref.x) %>%
  reframe(
    median_ddgb = median(b_ddg_pred, na.rm = TRUE),
    median_residual = median(residuals, na.rm = TRUE),
    scHAmin_ligand = first(scHAmin_ligand),
    orthosteric = first(orthosteric),
    Pos = first(Pos_ref.x))

nrow(merged_df_residue) #84
## [1] 84
cor.test(merged_df_residue$median_residual, merged_df_residue$median_ddgb, method = "spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  merged_df_residue$median_residual and merged_df_residue$median_ddgb
## S = 129482, p-value = 0.004137
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.3109446
#   Spearman's rank correlation rho
# 
# data:  merged_df_residue$median_residual and merged_df_residue$median_ddgb
# S = 129482, p-value = 0.004137
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
#        rho 
# -0.3109446

merged_df_residue$site_type <- "Non-orthosteric site"
merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"

figS3E <- ggplot(merged_df_residue, aes(x = median_ddgb, y =  median_residual, color = site_type)) +
  geom_point(size = 2, alpha = 0.5) +
  labs(
    title = "PSD-95-PDZ3: 84 residues (all 1587 mutations)",
    subtitle = "Spearman's rho = -0.31",
    x = "Median ddGb",
    y = "Median ESM1v-ddGf residual",
    color = "") +
  theme_classic() +
    scale_color_manual(values = c(
    "Non-orthosteric site" = "darkgreen",
    "Binding site" = "orange"
  )) + theme(legend.position = "bottom") +
  geom_vline(xintercept = 0, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_hline(yintercept = 0, linetype = "dashed", linewidth = 0.5, color = "grey") 
  #xlim(-3,2) + ylim(-5,5)

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_corr.pdf", 
       plot = figS3E, width = 4, height = 5, dpi = 300)
figS3E

merged_df_neg_ddgb <- merged_df %>% filter(b_ddg_pred < 0)
nrow(merged_df_neg_ddgb) #232
## [1] 232
merged_df_residue <- merged_df_neg_ddgb %>%
  group_by(Pos_ref.x) %>%
  reframe(
    median_ddgb = median(b_ddg_pred, na.rm = TRUE),
    median_residual = median(residuals, na.rm = TRUE),
    scHAmin_ligand = first(scHAmin_ligand),
    orthosteric = first(orthosteric),
    Pos = first(Pos_ref.x))

nrow(merged_df_residue) #
## [1] 60
merged_df_residue_nonfunc <- merged_df_residue %>% filter(orthosteric == FALSE)

# Fit exponential decay: y = a * exp(-b * x) + c
exp_model_pdz3_ddgb <- nls(abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand),
                 data = merged_df_residue,
                 start = list(a = 1, b = 0.1))
exp_model_pdz3_ddgb
## Nonlinear regression model
##   model: abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand)
##    data: merged_df_residue
##      a      b 
## 0.9770 0.1554 
##  residual sum-of-squares: 3.51
## 
## Number of iterations to convergence: 8 
## Achieved convergence tolerance: 2.666e-06
# Nonlinear regression model
#   model: abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand)
#    data: merged_df_residue
#      a      b 
# 0.9770 0.1554 
#  residual sum-of-squares: 3.51
# 
# Number of iterations to convergence: 8 
# Achieved convergence tolerance: 2.666e-06

a <- 0.9770
b <-  0.1554 
half_d <- log(2) / b  
half_d
## [1] 4.460407
merged_df_residue$site_type <- "Non-orthosteric site"
merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"

x_vals <- seq(min(merged_df_residue$scHAmin_ligand, na.rm = TRUE),
              max(merged_df_residue$scHAmin_ligand, na.rm = TRUE), length.out = 200)

predicted_y <- predict(exp_model_pdz3_ddgb, newdata = data.frame(scHAmin_ligand = x_vals))

fit_df <- data.frame(scHAmin_ligand = x_vals, b_ddg_pred = predicted_y)

nrow(merged_df_residue)
## [1] 60
p3<- ggplot(merged_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb))) +
  # Base layer: all points
  geom_point(aes(color = site_type, alpha = 0.4), size = 2) +
  
  # Loess fit line
  geom_line(data = fit_df, aes(x = scHAmin_ligand, y = b_ddg_pred),
            inherit.aes = FALSE, color = "black", size = 1) +
  geom_text_repel(data = merged_df_residue %>% filter(site_type != "Non-orthosteric site"),aes(label=Pos))+
  
  # Manual color palette for site type
  scale_color_manual(values = c(
    "Null" = "grey",
    "Non-orthosteric site" = "darkgreen",
    "Binding site" = "orange"
  )) +
  # Reference lines
  geom_vline(xintercept = 5, linetype = "dashed", color = "slategrey") +
  theme_classic() +
  labs(
    title = "PSD-95-PDZ3: allosteric decay",
    subtitle = "232 mutations with ddGb < 0",
    x = "Minimal distance to ligand",
    y = "Median ddGb",
    color = ""
  )  + theme(legend.position = "bottom") +
    annotate("text",  x = Inf, y = Inf,
             hjust = 1, vjust = 1,
           label = "y = a * exp (b * x)\na =  0.9770\nb = -0.1554 ",
           size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
## Warning: Duplicated aesthetics after name standardisation: hjust
p3 <- ggMarginal(
  p3,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)
p3

merged_df_neg_residual <- merged_df %>% filter(residuals > 0)
nrow(merged_df_neg_residual) #688
## [1] 688
merged_df_residue <- merged_df_neg_residual %>%
  group_by(Pos_ref.x) %>%
  reframe(
    median_ddgb = median(b_ddg_pred, na.rm = TRUE),
    median_residual = median(residuals, na.rm = TRUE),
    scHAmin_ligand = first(scHAmin_ligand),
    orthosteric = first(orthosteric),
    Pos = first(Pos_ref.x))

nrow(merged_df_residue) #81
## [1] 81
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model_pdz3_residual <- nls(abs(median_residual) ~ a * exp(-b * scHAmin_ligand),
                 data = merged_df_residue,
                 start = list(a = 1, b = 0.1))

exp_model_pdz3_residual 
## Nonlinear regression model
##   model: abs(median_residual) ~ a * exp(-b * scHAmin_ligand)
##    data: merged_df_residue
##        a        b 
##  1.43905 -0.02959 
##  residual sum-of-squares: 118.8
## 
## Number of iterations to convergence: 12 
## Achieved convergence tolerance: 3.042e-06
# Nonlinear regression model
#   model: abs(median_residual) ~ a * exp(-b * scHAmin_ligand)
#    data: merged_df_residue
#        a        b 
#  1.43905 -0.02959 
#  residual sum-of-squares: 118.8
# 
# Number of iterations to convergence: 12 
# Achieved convergence tolerance: 3.042e-06

a <- 1.43905
b <-  0.02959 
half_d <- log(2) / b  # ≈  
half_d
## [1] 23.42505
merged_df_residue$site_type <- "Non-orthosteric site"
merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"

x_vals <- seq(min(merged_df_residue$scHAmin_ligand, na.rm = TRUE),
              max(merged_df_residue$scHAmin_ligand, na.rm = TRUE), length.out = 200)

predicted_y <- predict(exp_model_pdz3_residual, newdata = data.frame(scHAmin_ligand = x_vals))

fit_df <- data.frame(scHAmin_ligand = x_vals, residual = predicted_y)

# Plot
p4 <- ggplot(merged_df_residue, aes(x = scHAmin_ligand, y = abs(median_residual))) +
  # Base layer: all points
  geom_point(aes(color = site_type, alpha = 0.4), size = 2) +
  
  # Loess fit line
  geom_line(data = fit_df , aes(x = scHAmin_ligand, y = residual),
            inherit.aes = FALSE, color = "black", size = 1) +

  
  # Manual color palette for site type
  scale_color_manual(values = c(
    "Non-orthosteric site" = "darkgreen",
    "Binding site" = "orange"
  )) +
  # Reference lines
  geom_vline(xintercept = 4.315068, linetype = "dashed", color = "slategrey") +
  geom_hline(yintercept = y_cutoff, linetype = "dashed", color = "slategrey") +
  theme_classic() +
  geom_text_repel(data = merged_df_residue %>% filter(site_type == "Binding site"), aes(label=Pos))+
  labs(
    title = "PSD-95-PDZ3: allosteric decay",
    subtitle = "688 mutations with residual > 0",
    x = "Minimal distance to ligand",
    y = "|Median ESM1v-ddGf residual|",
    color = ""
  )  + theme(legend.position = "bottom") +
    annotate("text",  x = Inf, y = Inf,
             hjust = 1, vjust = 1,
           label = "y = a * exp (b * x)\na =  1.43905\nb = -0.02959",
           size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
## Warning: Duplicated aesthetics after name standardisation: hjust
p4 <- ggMarginal(
  p4,
  type = "density",
  margins = "both",
  groupColour = FALSE,
  groupFill = FALSE,
  size = 10,
  colour = "grey",
  fill = "lightgrey"
)

p4
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

figS3F <- plot_grid(p3,p4, ncol = 2, nrow=1)
figS3F
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

sh3_final_df_ddgf <- read_tsv('/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/paper_supplements/domains_andre/GRB2-SH3/Abundance/mochi_project/task_1/weights/weights_Folding.txt')
## Rows: 1057 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): id, id_ref, trait_name
## dbl (19): Pos, Pos_ref, fold_1, fold_2, fold_3, fold_4, fold_5, fold_6, fold...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sh3_final_df_ddgf <- sh3_final_df_ddgf %>% dplyr::select(id_ref, Pos_ref, `mean_kcal/mol`, `std_kcal/mol`)
dim(sh3_final_df_ddgf) #1057    4
## [1] 1057    4
sh3_final_df_ddgb <- read_tsv('/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/paper_supplements/domains_andre/GRB2-SH3/Abundance/mochi_project/task_1/weights/weights_Binding.txt')
## Rows: 757 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (3): id, id_ref, trait_name
## dbl (19): Pos, Pos_ref, fold_1, fold_2, fold_3, fold_4, fold_5, fold_6, fold...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sh3_final_df_ddgb <- sh3_final_df_ddgb %>% dplyr::select(id_ref, Pos_ref, `mean_kcal/mol`, `std_kcal/mol`)
dim(sh3_final_df_ddgb) #757   4
## [1] 757   4
sh3_data <- process_protein_data(
  consurf_file = '/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/scores/3_consurf/grb2_consurf/new_cleaned_file.txt',
  am_file = '/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/scores/0_am/alphamissense_grb2.tsv',
  esm1b_file = '/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/scores/1_esm1b/GRB2_P62993_esm1b.csv',
  popeve_file = '/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/scores/2_popeve/GRB2_NP_002077.1.csv'
)
## Rows: 217 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): SEQ, ATOM, COLOR, CONFIDENCE INTERVAL, MSA DATA, RESIDUE VARIETY
## dbl (2): POS, SCORE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4123 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): X1, X2, X4
## dbl (1): X3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4340 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variant
## dbl (2): score, pos
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4123 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): mutant
## dbl (14): column_coverage, popEVE, pop-adjusted_EVE, pop-adjusted_ESM1v, pop...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4123 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): mutant
## dbl (14): column_coverage, popEVE, pop-adjusted_EVE, pop-adjusted_ESM1v, pop...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
process_and_merge_protein_data <- function(protein_data, merged_df) {

  # Merge with popEVE and ESM1v data
  merged_df_ddg <- merged_df %>%
    merge(protein_data$popeve, by.x = "id_ref", by.y = "id",all.x = FALSE) %>%
    merge(protein_data$esm1v, by.x = "id_ref", by.y = "id", all.x = FALSE)

  return(merged_df_ddg)
}

sh3_final_df_ddgf <- sh3_final_df_ddgf %>%
  mutate(wt_aa = str_extract(id_ref, '^[A-Za-z]+'),
         mt_aa = str_extract(id_ref, '[A-Za-z]+$')) %>%
  rename(old_id = id_ref) %>%
  rename(old_Pos = Pos_ref) %>%
  rename(f_ddg_pred = `mean_kcal/mol`) %>%
  rename(f_ddg_pred_sd = `std_kcal/mol`) %>%
  mutate(Pos_ref = as.numeric(old_Pos) + 158) %>%
  mutate(id_ref = paste0(wt_aa, Pos_ref, mt_aa))

sh3_final_df_ddgf <-process_and_merge_protein_data(sh3_data, sh3_final_df_ddgf)
dim(sh3_final_df_ddgf) # 1056   10
## [1] 1056   10
sh3_final_df_ddgb <- sh3_final_df_ddgb %>%
  mutate(wt_aa = str_extract(id_ref, '^[A-Za-z]+'),
         mt_aa = str_extract(id_ref, '[A-Za-z]+$')) %>%
  rename(old_id = id_ref) %>%
  rename(old_Pos = Pos_ref) %>%
    rename(b_ddg_pred = `mean_kcal/mol`) %>%
  rename(b_ddg_pred_sd = `std_kcal/mol`) %>%
  mutate(Pos_ref = as.numeric(old_Pos) + 158) %>%
  mutate(id_ref = paste0(wt_aa, Pos_ref, mt_aa))

sh3_final_df_ddgb <-process_and_merge_protein_data(sh3_data, sh3_final_df_ddgb)
dim(sh3_final_df_ddgb) # 756  10
## [1] 756  10
################################# Stability Definition #########################################
sh3_final_df_ddg_2plot <- merge(sh3_final_df_ddgf, sh3_final_df_ddgb, by = "id_ref", all.x = TRUE)
dim(sh3_final_df_ddg_2plot) #1056   19
## [1] 1056   19
# Step 1: Calculate z-scores and p-values for stabilizing mutations
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
  mutate(z_stab = (f_ddg_pred - 0) / f_ddg_pred_sd)

sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
  mutate(p_stab = pnorm(z_stab))

# Step 2: Calculate z-scores and p-values for destabilizing mutations
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
  mutate(z_destab = (f_ddg_pred - 0.5) / f_ddg_pred_sd,
         p_destab = 1 - pnorm(z_destab))

# Step 3: Classify mutations based on p-values
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
  mutate(class = case_when(
    p_stab < 0.05 ~ "stabilizing",
    p_destab < 0.05 ~ "destabilizing",
    TRUE ~ "no change"
  ))

# Print summary of classification counts
class_summary <- table(sh3_final_df_ddg_2plot$class)
print(class_summary)
## 
## destabilizing     no change   stabilizing 
##           585           411            60
#destabilizing     no change   stabilizing 
#          585           411            60 
paper_anno <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/cleaned_ddg/sh3_ddg_cleaned.csv")
table(paper_anno$orthosteric)
## 
## TRUE 
##  152
head(paper_anno)
##      id Pos_real Pos_ref old_id id_ref Pos mut_order f_dg_pred f_ddg_pred
## 1 A163C      163       5    A5C    A5C   5         1 1.2871009   2.118572
## 2 A163D      163       5    A5D    A5D   5         1 0.9224186   1.753889
## 3 A163E      163       5    A5E    A5E   5         1 1.3930653   2.224536
## 4 A163F      163       5    A5F    A5F   5         1 1.8272556   2.658726
## 5 A163G      163       5    A5G    A5G   5         1 0.4485680   1.280039
## 6 A163H      163       5    A5H    A5H   5         1 1.4787602   2.310231
##   f_ddg_pred_sd  b_dg_pred b_ddg_pred b_ddg_pred_sd f_ddg_pred_conf
## 1    0.02432856         NA         NA            NA            TRUE
## 2    0.02558423 -0.2844405  0.7605213    0.03313322            TRUE
## 3    0.02310688 -2.4969005 -1.4519387    0.62053013            TRUE
## 4    0.08655548         NA         NA            NA            TRUE
## 5    0.01206305 -1.2700695 -0.2251077    0.01497124            TRUE
## 6    0.04339128         NA         NA            NA            TRUE
##   b_ddg_pred_conf HAmin_ligand scHAmin_ligand RSASA    SS Pos_class  protein
## 1           FALSE     9.511215       10.33203  0.06 sheet      core GRB2-SH3
## 2            TRUE     9.511215       10.33203  0.06 sheet      core GRB2-SH3
## 3           FALSE     9.511215       10.33203  0.06 sheet      core GRB2-SH3
## 4           FALSE     9.511215       10.33203  0.06 sheet      core GRB2-SH3
## 5            TRUE     9.511215       10.33203  0.06 sheet      core GRB2-SH3
## 6           FALSE     9.511215       10.33203  0.06 sheet      core GRB2-SH3
##   WT_AA Mut b_ddg_wposmeanabs b_ddg_wposse allosteric orthosteric
## 1     A   C         0.3868335  0.009327359         NA          NA
## 2     A   D         0.3868335  0.009327359         NA          NA
## 3     A   E         0.3868335  0.009327359         NA          NA
## 4     A   F         0.3868335  0.009327359         NA          NA
## 5     A   G         0.3868335  0.009327359         NA          NA
## 6     A   H         0.3868335  0.009327359         NA          NA
##   allosteric_mutation wt_aa.x mt_aa wt_aa.y      atom consurf color
## 1                  NA       A     C       A ALA:163:A  -1.131     9
## 2                TRUE       A     D       A ALA:163:A  -1.131     9
## 3                  NA       A     E       A ALA:163:A  -1.131     9
## 4                  NA       A     F       A ALA:163:A  -1.131     9
## 5               FALSE       A     G       A ALA:163:A  -1.131     9
## 6                  NA       A     H       A ALA:163:A  -1.131     9
##   confidence_interval msa_data     RESIDUE.VARIETY     AM  AM_logit   ESM1b
## 1 -1.307, -1.003  9,8   47/150 A 93%, S  4%, E  2% 0.9660 3.3468033 -12.660
## 2 -1.307, -1.003  9,8   47/150 A 93%, S  4%, E  2% 0.9998 8.5169932 -17.821
## 3 -1.307, -1.003  9,8   47/150 A 93%, S  4%, E  2% 0.9997 8.1114280 -16.761
## 4 -1.307, -1.003  9,8   47/150 A 93%, S  4%, E  2% 0.9991 7.0122154 -18.435
## 5 -1.307, -1.003  9,8   47/150 A 93%, S  4%, E  2% 0.5857 0.3462174  -7.498
## 6 -1.307, -1.003  9,8   47/150 A 93%, S  4%, E  2% 0.9998 8.5169932 -20.762
##   popEVE   ESM1v   EVE
## 1 -4.803  -8.415 6.970
## 2 -5.620 -12.132 8.900
## 3 -5.563 -11.812 8.842
## 4 -5.646 -12.985 8.218
## 5 -4.392  -5.703 6.908
## 6 -6.062 -14.734 9.419
ortho_list <- unique(paper_anno %>% filter(orthosteric == TRUE) %>% pull(Pos_real))

ortho_list<- append(ortho_list, c(170 ,192 ,195 ,204 ,209))

sh3_final_df_ddg_2plot$orthosteric <- FALSE
sh3_final_df_ddg_2plot$orthosteric[sh3_final_df_ddg_2plot$Pos_ref.x %in% ortho_list] <- TRUE

table(sh3_final_df_ddg_2plot$orthosteric)
## 
## FALSE  TRUE 
##   809   247
#FALSE  TRUE 
#  809   247 

sh3_final_df_ddg_2plot_all <- sh3_final_df_ddg_2plot
nrow(sh3_final_df_ddg_2plot_all) #1056
## [1] 1056
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot[!is.na(sh3_final_df_ddg_2plot$b_ddg_pred), ]
nrow(sh3_final_df_ddg_2plot) #756
## [1] 756
head(sh3_final_df_ddg_2plot)
##    id_ref old_id.x old_Pos.x f_ddg_pred f_ddg_pred_sd wt_aa.x mt_aa.x Pos_ref.x
## 2   A163D      A5D         5   1.839457    0.08964487       A       D       163
## 3   A163E      A5E         5   2.279397    0.05693226       A       E       163
## 5   A163G      A5G         5   1.345365    0.02795549       A       G       163
## 7   A163I      A5I         5   2.784022    0.18132980       A       I       163
## 11  A163N      A5N         5   2.413522    0.08280685       A       N       163
## 12  A163P      A5P         5   2.028785    0.05923400       A       P       163
##    popEVE.x ESM1v.x old_id.y old_Pos.y b_ddg_pred b_ddg_pred_sd wt_aa.y mt_aa.y
## 2    -5.620 -12.132      A5D         5  0.7594426    0.13786146       A       D
## 3    -5.563 -11.812      A5E         5 -1.4261234    0.16315904       A       E
## 5    -4.392  -5.703      A5G         5 -0.2783955    0.02497829       A       G
## 7    -5.717 -11.222      A5I         5 -0.8344887    0.55209774       A       I
## 11   -5.778 -13.038      A5N         5  0.5357424    0.51385355       A       N
## 12   -5.446 -10.389      A5P         5  0.5694963    0.14353457       A       P
##    Pos_ref.y popEVE.y ESM1v.y   z_stab p_stab z_destab p_destab         class
## 2        163   -5.620 -12.132 20.51938      1 14.94182        0 destabilizing
## 3        163   -5.563 -11.812 40.03701      1 31.25464        0 destabilizing
## 5        163   -4.392  -5.703 48.12525      1 30.23968        0 destabilizing
## 7        163   -5.717 -11.222 15.35336      1 12.59595        0 destabilizing
## 11       163   -5.778 -13.038 29.14641      1 23.10826        0 destabilizing
## 12       163   -5.446 -10.389 34.25034      1 25.80924        0 destabilizing
##    orthosteric
## 2        FALSE
## 3        FALSE
## 5        FALSE
## 7        FALSE
## 11       FALSE
## 12       FALSE
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>% dplyr::select(id_ref, f_ddg_pred, f_ddg_pred_sd,
                                                                     ESM1v.x, b_ddg_pred, b_ddg_pred_sd,
                                                                     orthosteric,class) %>%
  dplyr::rename(ESM1v = ESM1v.x)

sh3_final_df_ddg_2plot_active <- sh3_final_df_ddg_2plot %>% filter(orthosteric == TRUE)
mean(sh3_final_df_ddg_2plot_active$b_ddg_pred) #0.7443709
## [1] 0.7443709
sh3_final_df_ddg_2plot$allosteric <- FALSE
sh3_final_df_ddg_2plot$allosteric[sh3_final_df_ddg_2plot$b_ddg_pred >= mean(sh3_final_df_ddg_2plot_active$b_ddg_pred)] <- TRUE
sh3_final_df_ddg_2plot$allosteric[sh3_final_df_ddg_2plot$orthosteric == TRUE] <- FALSE
table(sh3_final_df_ddg_2plot$allosteric)
## 
## FALSE  TRUE 
##   703    53
#FALSE  TRUE 
#  703    53 
sh3_final_df_ddg_2plot_out <- sh3_final_df_ddg_2plot %>% filter(orthosteric == FALSE)

# Fit a loess model using the filtered data
loess_fit <- loess(ESM1v ~ f_ddg_pred, data = sh3_final_df_ddg_2plot_out, span = 0.7, family = "symmetric")

# Predict fitted values for ALL data points using the loess model 
nrow(sh3_final_df_ddg_2plot_all)
## [1] 1056
sh3_final_df_ddg_2plot_all$fitted <- predict(loess_fit, newdata = sh3_final_df_ddg_2plot_all)

# Calculate residuals for ALL points
sh3_final_df_ddg_2plot_all$residuals <- sh3_final_df_ddg_2plot_all$ESM1v.x - sh3_final_df_ddg_2plot_all$fitted

range(sh3_final_df_ddg_2plot_all$residuals, na.rm = TRUE) #-13.251857   5.372929
## [1] -13.251857   5.372929
sh3_final_df_ddg_2plot_all <- sh3_final_df_ddg_2plot_all %>%
  mutate(mutation_position = as.numeric(str_extract(old_id.x, "(?<=\\D)(\\d+)(?=\\D)")))

median_residuals <- sh3_final_df_ddg_2plot_all %>%
  dplyr::group_by(mutation_position) %>%
  summarise(median_residuals = median(residuals, na.rm = TRUE))

min(median_residuals$median_residuals) #-8.978788
## [1] -8.978788
max(median_residuals$median_residuals) #2.738424
## [1] 2.738424
pdb <- read.pdb("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/domains/2vwf.pdb")
##    PDB has ALT records, taking A only, rm.alt=TRUE
data <- median_residuals
head(data)
## # A tibble: 6 × 2
##   mutation_position median_residuals
##               <dbl>            <dbl>
## 1                 1            0.649
## 2                 2           -2.35 
## 3                 3           -1.80 
## 4                 4           -3.22 
## 5                 5           -0.182
## 6                 6           -1.86
# Create a new B-factor vector initialized with the current B-factors from the PDB
new_b_factors <- pdb$atom$b

# Loop through each position in the correlation data and update the B-factors
for (i in 1:nrow(data)) {
  position <- data$mutation_position[i]
  correlation_value <- data$median_residuals[i]
  
  # Find indices in the PDB that match the current position
  indices <- which(pdb$atom$resno == position)
  
  # Print the indices and current B-factors before updating
  #cat("Updating residue number:", position, "\n")
  #cat("Indices in PDB:", indices, "\n")
  #cat("Current B-factors:", new_b_factors[indices], "\n")
  
  # Update B-factors for all atoms in the current residue
  new_b_factors[indices] <- correlation_value
  
  # Print the new B-factors after updating
  #cat("Updated B-factors:", new_b_factors[indices], "\n")
  #cat("\n")  # Add an extra line for readability
}
# Replace non-matching B-factors with outlier value so we can filter it out in ChimeraX
non_matching_indices <- setdiff(seq_along(new_b_factors), which(pdb$atom$resno %in% data$mutation_position))
new_b_factors[non_matching_indices] <- 999

# Assign the new B-factors back to the pdb structure
# Write the modified PDB structure to a new file
pdb$atom$b <- new_b_factors
write.pdb(pdb, file = "/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/domains/2vwf_median_residual_loess.pdb")
sh3_final_df_ddg_2plot_out <- sh3_final_df_ddg_2plot %>% filter(orthosteric == FALSE)

# Fit a loess model using the filtered data
loess_fit <- loess(ESM1v ~ f_ddg_pred, data = sh3_final_df_ddg_2plot_out, span = 0.7, family = "symmetric")

# Predict fitted values for ALL data points using the loess model 
sh3_final_df_ddg_2plot$fitted <- predict(loess_fit, newdata = sh3_final_df_ddg_2plot)

# Calculate residuals for ALL points
sh3_final_df_ddg_2plot$residuals <- sh3_final_df_ddg_2plot$ESM1v - sh3_final_df_ddg_2plot$fitted

sum(is.na(sh3_final_df_ddg_2plot$residuals))
## [1] 1
sum(is.na(sh3_final_df_ddg_2plot$fitted))
## [1] 1
range(sh3_final_df_ddg_2plot$residuals, na.rm = TRUE) #-13.251857   5.372929
## [1] -13.251857   5.372929
range(sh3_final_df_ddg_2plot$ESM1v) #-17.80   2.73
## [1] -17.80   2.73
range(sh3_final_df_ddg_2plot$f_ddg_pred) #-0.8155064  3.0465777
## [1] -0.8155064  3.0465777
fit_line_df <- data.frame(
  f_ddg_pred = seq(min(0, na.rm = TRUE),
                   max(sh3_final_df_ddg_2plot_out$f_ddg_pred, na.rm = TRUE),
                   length.out = 200)
)

fit_line_df$ESM1v <- predict(loess_fit, newdata = fit_line_df)
cor.test(sh3_final_df_ddg_2plot$f_ddg_pred, sh3_final_df_ddg_2plot$ESM1v, method = "spearman") #--0.5766367 
## Warning in cor.test.default(sh3_final_df_ddg_2plot$f_ddg_pred,
## sh3_final_df_ddg_2plot$ESM1v, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  sh3_final_df_ddg_2plot$f_ddg_pred and sh3_final_df_ddg_2plot$ESM1v
## S = 113538982, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.5766367
nrow(sh3_final_df_ddg_2plot)
## [1] 756
p1 <- ggplot(sh3_final_df_ddg_2plot, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  #geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
            inherit.aes = FALSE, color = "black", linewidth = 0.8) +
  labs(
    title = "GRB2-SH3: All 756 Mutations",
    subtitle = "Spearman's rho = -0.58",
    x = "Measured ddGf",
    y = "ESM1v",
    color = "Residuals"
  ) +
  theme_classic() +
  ylim(-18, 2.8) + xlim(-0.9, 3.1) +
  scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
                        limits = c(-13.3, 5.4)) +
  theme(legend.position = "none")

p1

sh3_final_df_ddg_2plot_int <- sh3_final_df_ddg_2plot %>% filter(orthosteric == TRUE)
nrow(sh3_final_df_ddg_2plot_int)
## [1] 171
cor.test(sh3_final_df_ddg_2plot_int$f_ddg_pred, sh3_final_df_ddg_2plot_int$ESM1v, method = "spearman") #-0.40
## Warning in cor.test.default(sh3_final_df_ddg_2plot_int$f_ddg_pred,
## sh3_final_df_ddg_2plot_int$ESM1v, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  sh3_final_df_ddg_2plot_int$f_ddg_pred and sh3_final_df_ddg_2plot_int$ESM1v
## S = 1168634, p-value = 4.893e-08
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.4023498
p2 <- ggplot(sh3_final_df_ddg_2plot_int, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
            inherit.aes = FALSE, color = "black", linewidth = 0.6) +
  #geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  labs(
    title = "171 Orthosteric Mutations",
    subtitle = "Spearman's rho = -0.40",
    x = "Measured ddGf",
    y = "ESM1v",
    color = "Residuals"
  ) +
  theme_classic() +
  ylim(-18, 2.8) + xlim(-0.9, 3.1) +
  scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
                        limits = c(-13.3, 5.4)) +
  theme(legend.position = "none")

p2

sh3_final_df_ddg_2plot_allo <- sh3_final_df_ddg_2plot %>% filter(allosteric == TRUE)
nrow(sh3_final_df_ddg_2plot_allo)
## [1] 53
cor.test(sh3_final_df_ddg_2plot_allo$f_ddg_pred, sh3_final_df_ddg_2plot_allo$ESM1v, method = "spearman") #-0.55
## 
##  Spearman's rank correlation rho
## 
## data:  sh3_final_df_ddg_2plot_allo$f_ddg_pred and sh3_final_df_ddg_2plot_allo$ESM1v
## S = 38442, p-value = 2.745e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.5498307
unique(sh3_final_df_ddg_2plot_allo$id_ref)
##  [1] "A163D" "A163V" "A197P" "D181L" "F177H" "F177S" "F182W" "F205C" "F205N"
## [10] "F205S" "G173A" "G173C" "G173L" "G173R" "G173V" "G173W" "G196L" "G196R"
## [19] "G203C" "G203D" "G203F" "G203H" "G203I" "G203L" "G203S" "G203T" "G203Y"
## [28] "H184P" "I183F" "I183M" "I183N" "I183S" "I183T" "L164P" "L175K" "L175P"
## [37] "L175Q" "L175R" "M186G" "N188M" "Q162M" "Q162P" "R178A" "R178P" "R207P"
## [46] "T211P" "V161D" "V185D" "V185E" "V185F" "V185G" "V210D" "W194R"
p3 <- ggplot(sh3_final_df_ddg_2plot_allo, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
            inherit.aes = FALSE, color = "black", linewidth = 0.6) +
  #geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  geom_text_repel(data = sh3_final_df_ddg_2plot_allo %>% filter(residuals < -0.5),
                  aes(label = id_ref), size = 3) +
  labs(
    title = "53 Allosteric Mutations",
    subtitle = "Spearman's rho = -0.55",
    x = "Measured ddGf",
    y = "ESM1v",
    color = "Residuals"
  ) +
  theme_classic() +
  ylim(-18, 2.8) + xlim(-0.9, 3.1) +
  scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
                       limits = c(-13.3, 5.4)) +
  theme(legend.position = "none")


p3
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

sh3_final_df_ddg_2plot_other <- sh3_final_df_ddg_2plot %>% filter(allosteric == FALSE) %>% filter(orthosteric == FALSE)
nrow(sh3_final_df_ddg_2plot_other)
## [1] 532
cor.test(sh3_final_df_ddg_2plot_other$f_ddg_pred, sh3_final_df_ddg_2plot_other$ESM1v, method = "spearman") #-0.768915 
## Warning in cor.test.default(sh3_final_df_ddg_2plot_other$f_ddg_pred,
## sh3_final_df_ddg_2plot_other$ESM1v, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  sh3_final_df_ddg_2plot_other$f_ddg_pred and sh3_final_df_ddg_2plot_other$ESM1v
## S = 44390403, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## -0.768915
p4 <- ggplot(sh3_final_df_ddg_2plot_other, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
            inherit.aes = FALSE, color = "black", linewidth = 0.6) +
  #geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  labs(
    title = "532 Other Mutations",
    subtitle = "Spearman's rho = -0.77",
    x = "Measured ddGf",
    y = "ESM1v",
    color = "Residuals"
  ) +
  theme_classic() +
  ylim(-18, 2.8) + xlim(-0.9, 3.1) +
  scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
                         limits = c(-13.3, 5.4)) +
  theme(legend.position = "none")

p4

figS3A <- plot_grid(p1,p2,p3, p4, ncol = 4, nrow=1)

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_scatter.pdf", 
       plot = figS3A, width = 12, height = 3, dpi = 300)
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
figS3A
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

figS3A <- ggplot(sh3_final_df_ddg_2plot_other, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
  geom_point(size = 2, alpha = 0.35) +
  #geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
  #geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
  labs(
    title = "532 Other Mutations",
    subtitle = "Spearman's rho = -0.77",
    x = "Measured ddGf",
    y = "ESM1v",
    color = "Residuals"
  ) +
  theme_classic() +
  ylim(-18, 2.8) + xlim(-0.9, 3.1) +
  scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
                         limits = c(-13.3, 5.4)) +
  theme(legend.position = "bottom")

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_legend.pdf", 
       plot = figS3A, width = 12, height = 3, dpi = 300)
figS3A

sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
  mutate(pos_eve = as.numeric(str_extract(id_ref, "(?<=\\D)(\\d+)(?=\\D)")))

sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
  mutate(site_class = case_when(
    orthosteric == TRUE ~ "orthosteric",
    allosteric == TRUE ~ "allosteric",
    TRUE ~ "other"
  ))

label_df <- sh3_final_df_ddg_2plot %>%
  group_by(site_class) %>%
  summarise(
    n = n(),
    median_val = median(residuals, na.rm = TRUE),
    .groups = "drop"
  )

sh3_final_df_ddg_2plot$site_class <- factor(sh3_final_df_ddg_2plot$site_class, levels = c("orthosteric", "allosteric", "other"))

fig3C <- ggplot(sh3_final_df_ddg_2plot, aes(x = site_class, y = residuals, fill = site_class)) +
  geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA)+
  stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
  stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
  geom_text(
    data = label_df,
    aes(x = site_class, y = max(sh3_final_df_ddg_2plot$residuals) * 1.1,
        label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  geom_text(
    data = label_df,
    aes(x = site_class, y = 10,
        label = paste0("n = ", n)),
    inherit.aes = FALSE,
    size = 4
  ) +
  geom_text(
    data = label_df,
    aes(x = site_class, y = median_val + 1.9, label = sprintf("%.2f", median_val)),
    inherit.aes = FALSE,
    size = 6
  ) +
  geom_signif(
    comparisons = list(
      c("orthosteric", "allosteric"),
      c("orthosteric", "other"),
      c("allosteric", "other")
    ),
    map_signif_level = FALSE,
    test = "wilcox.test",
    step_increase = 0.1,
    tip_length = 0.01
  ) +
  
  # Labels and theme
  labs(
    title = "GRB2-SH3",
    subtitle = "",
    x = "",
    y = "ESM1v - ddGf residual"
  ) +
  scale_fill_manual(values = c(
    "orthosteric" = "orange",
    "allosteric" = "cyan3",
    "other" = "darkgreen"
  )) +
  theme_classic() +
  theme(legend.position = "none")

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_violin.pdf", 
       plot = fig3C, width = 3, height = 3, dpi = 300)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_signif()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_text()`).
fig3C
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_signif()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_text()`).

############################# MODEL 0: all mutations, ddGf + ddGa #############################
set.seed(11)
# Usage example
m0_results <- fit_model_and_diagnostics_both(input_data = sh3_final_df_ddg_2plot, model_id = "m0")
## [1] "Adjusted R-squared: 0.5349"
## [1] "Adjusted R-squared 95% CI: 0.4851 - 0.5825"
m1_results <- fit_model_and_diagnostics_ddgf(input_data = sh3_final_df_ddg_2plot, model_id = "m1")
## [1] "Adjusted R-squared: 0.3613"
## [1] "Adjusted R-squared 95% CI: 0.2873 - 0.4255"
m2_results <- fit_model_and_diagnostics_ddga(input_data = sh3_final_df_ddg_2plot, model_id = "m2")
## [1] "Adjusted R-squared: 0.1271"
## [1] "Adjusted R-squared 95% CI: 0.0417 - 0.2097"
############################# MODEL 3: non-active mutations, ddGf + ddGa #############################
table(sh3_final_df_ddg_2plot$orthosteric)
## 
## FALSE  TRUE 
##   585   171
sh3_final_df_ddg_2plot_non_active <- sh3_final_df_ddg_2plot %>% filter(orthosteric == FALSE)
nrow(sh3_final_df_ddg_2plot_non_active) #585
## [1] 585
m3_results <- fit_model_and_diagnostics_both(input_data = sh3_final_df_ddg_2plot_non_active, model_id = "m3")
## [1] "Adjusted R-squared: 0.6672"
## [1] "Adjusted R-squared 95% CI: 0.6265 - 0.7046"
m4_results <- fit_model_and_diagnostics_ddgf(input_data = sh3_final_df_ddg_2plot_non_active, model_id = "m4")
## [1] "Adjusted R-squared: 0.6408"
## [1] "Adjusted R-squared 95% CI: 0.596 - 0.6816"
m5_results <- fit_model_and_diagnostics_ddga(input_data = sh3_final_df_ddg_2plot_non_active, model_id = "m5")
## [1] "Adjusted R-squared: 0.0447"
## [1] "Adjusted R-squared 95% CI: -0.0596 - 0.1449"
############################# MODEL 6: active mutations, ddGf + ddGa #############################
sh3_final_df_ddg_2plot_active <- sh3_final_df_ddg_2plot %>% filter(orthosteric == TRUE)
nrow(sh3_final_df_ddg_2plot_active) #171
## [1] 171
m6_results <- fit_model_and_diagnostics_ddgf(input_data = sh3_final_df_ddg_2plot_active, model_id = "m6")
## [1] "Adjusted R-squared: 0.1814"
## [1] "Adjusted R-squared 95% CI: 0.0291 - 0.3055"
m7_results <- fit_model_and_diagnostics_ddga(input_data = sh3_final_df_ddg_2plot_active, model_id = "m7")
## [1] "Adjusted R-squared: 0.1752"
## [1] "Adjusted R-squared 95% CI: -0.0103 - 0.3312"
m8_results <- fit_model_and_diagnostics_both(input_data = sh3_final_df_ddg_2plot_active, model_id = "m8")
## [1] "Adjusted R-squared: 0.4067"
## [1] "Adjusted R-squared 95% CI: 0.2788 - 0.5141"
wilcox.test(abs(m6_results$adj_r2_values), abs(m7_results$adj_r2_values))
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  abs(m6_results$adj_r2_values) and abs(m7_results$adj_r2_values)
## W = 524635, p-value = 0.05643
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(
  subset(plot_data, category == "orthosteric" & type == "abundance")$adj_r2_values,
  subset(plot_data, category == "orthosteric" & type == "activity")$adj_r2_values
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  subset(plot_data, category == "orthosteric" & type == "abundance")$adj_r2_values and subset(plot_data, category == "orthosteric" & type == "activity")$adj_r2_values
## W = 523942, p-value = 0.06373
## alternative hypothesis: true location shift is not equal to 0
anova(m0_results$fit, m1_results$fit)
## Analysis of Variance Table
## 
## Model 1: ESM1v ~ f_ddg_pred + b_ddg_pred
## Model 2: ESM1v ~ f_ddg_pred
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    753 5794.6                                  
## 2    754 7967.9 -1   -2173.3 282.41 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1: ESM1v ~ f_ddg_pred + b_ddg_pred
# Model 2: ESM1v ~ f_ddg_pred
#   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
# 1    753 5794.6                                  
# 2    754 7967.9 -1   -2173.3 282.41 < 2.2e-16 ***
anova(m4_results$fit, m3_results$fit)
## Analysis of Variance Table
## 
## Model 1: ESM1v ~ f_ddg_pred
## Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    583 3265.3                                  
## 2    582 3019.8  1    245.48 47.311 1.568e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1: ESM1v ~ f_ddg_pred
# Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
#   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
# 1    583 3265.3                                  
# 2    582 3019.8  1    245.48 47.311 1.568e-11 ***
anova(m6_results$fit, m8_results$fit)
## Analysis of Variance Table
## 
## Model 1: ESM1v ~ f_ddg_pred
## Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    169 1864.3                                  
## 2    168 1343.0  1    521.27 65.205 1.245e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1: ESM1v ~ f_ddg_pred
# Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
#   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
# 1    169 1864.3                                  
# 2    168 1343.0  1    521.27 65.205 1.245e-13 ***
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
  mutate(pos_eve = as.numeric(str_extract(id_ref, "(?<=\\D)(\\d+)(?=\\D)")))

m0_results <- fit_model_and_diagnostics_both_lmm(input_data = sh3_final_df_ddg_2plot, model_id = "m0")
## [1] "Marginal R-squared: 0.4035"
## [1] "Conditional R-squared: 0.7589"
#[1] "Marginal R-squared: 0.4035"
#[1] "Conditional R-squared: 0.7589"
m1_results <- fit_model_and_diagnostics_ddgf_lmm(input_data = sh3_final_df_ddg_2plot, model_id = "m1")
## [1] "Marginal R-squared: 0.3404"
## [1] "Conditional R-squared: 0.7687"
#Warning: Model failed to converge with max|grad| = 0.00504248 (tol = 0.002, component 1)[1] "Marginal R-squared: 0.3404"
#[1] "Conditional R-squared: 0.7687"
sh3_final_df_ddg_2plot_non_active <- sh3_final_df_ddg_2plot_non_active %>%
  mutate(pos_eve = as.numeric(str_extract(id_ref, "(?<=\\D)(\\d+)(?=\\D)")))

m3_results <- fit_model_and_diagnostics_both_lmm(input_data = sh3_final_df_ddg_2plot_non_active, model_id = "m3")
## [1] "Marginal R-squared: 0.6049"
## [1] "Conditional R-squared: 0.7721"
#[1] "Marginal R-squared: 0.6049"
#[1] "Conditional R-squared: 0.7721"
m4_results <- fit_model_and_diagnostics_ddgf_lmm(input_data = sh3_final_df_ddg_2plot_non_active, model_id = "m4")
## [1] "Marginal R-squared: 0.5667"
## [1] "Conditional R-squared: 0.7592"
#[1] "Marginal R-squared: 0.5667"
#[1] "Conditional R-squared: 0.7592"
sh3_final_df_ddg_2plot_active <- sh3_final_df_ddg_2plot_active %>%
  mutate(pos_eve = as.numeric(str_extract(id_ref, "(?<=\\D)(\\d+)(?=\\D)")))

m6_results <- fit_model_and_diagnostics_both_lmm(input_data = sh3_final_df_ddg_2plot_active, model_id = "m6")
## [1] "Marginal R-squared: 0.1479"
## [1] "Conditional R-squared: 0.5509"
#[1] "Marginal R-squared: 0.1479"
#[1] "Conditional R-squared: 0.5509"
m7_results <- fit_model_and_diagnostics_ddgf_lmm(input_data = sh3_final_df_ddg_2plot_active, model_id = "m7")
## [1] "Marginal R-squared: 0.0958"
## [1] "Conditional R-squared: 0.594"
#[1] "Marginal R-squared: 0.0958"
#[1] "Conditional R-squared: 0.594"
anova(m1_results$fit, m0_results$fit)
## refitting model(s) with ML (instead of REML)
## Data: input_data
## Models:
## m1_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
## m0_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
##                npar    AIC    BIC logLik -2*log(L)  Chisq Df Pr(>Chisq)    
## m1_results$fit    4 3339.9 3358.4  -1666    3331.9                         
## m0_results$fit    5 3304.1 3327.2  -1647    3294.1 37.858  1  7.609e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# m1_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
# m0_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
#                npar    AIC    BIC logLik -2*log(L)  Chisq Df Pr(>Chisq)    
# m1_results$fit    4 3339.9 3358.4  -1666    3331.9                         
# m0_results$fit    5 3304.1 3327.2  -1647    3294.1 37.858  1  7.609e-10 ***

anova(m4_results$fit, m3_results$fit)
## refitting model(s) with ML (instead of REML)
## Data: input_data
## Models:
## m4_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
## m3_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
##                npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)    
## m4_results$fit    4 2461.3 2478.8 -1226.7    2453.3                         
## m3_results$fit    5 2434.5 2456.4 -1212.3    2424.5 28.795  1  8.047e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Models:
# m4_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
# m3_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
#                npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)    
# m4_results$fit    4 2461.3 2478.8 -1226.7    2453.3                         
# m3_results$fit    5 2434.5 2456.4 -1212.3    2424.5 28.795  1  8.047e-08 ***

anova(m7_results$fit, m6_results$fit)
## refitting model(s) with ML (instead of REML)
## Data: input_data
## Models:
## m7_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
## m6_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
##                npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)  
## m7_results$fit    4 810.36 822.92 -401.18    802.36                       
## m6_results$fit    5 807.81 823.52 -398.91    797.81 4.5424  1    0.03307 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# m7_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
# m6_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
#                npar    AIC    BIC  logLik -2*log(L)  Chisq Df Pr(>Chisq)  
# m7_results$fit    4 810.36 822.92 -401.18    802.36                       
# m6_results$fit    5 807.81 823.52 -398.91    797.81 4.5424  1    0.03307 *
# # Combine all adjusted_r_squared values into a single dataframe
# # Step 1: Build raw data (same as before)
# plot_data <- data.frame(
#   category = rep(c("whole protein", "non-orthosteric", "orthosteric"), each = 3000),
#   type = rep(c("abundance", "activity", "both"), times = 3, each = 1000),
#   adj_r2_values = c(
#     m1_results$adj_r2_values, m2_results$adj_r2_values, m0_results$adj_r2_values,
#     m4_results$adj_r2_values, m5_results$adj_r2_values, m3_results$adj_r2_values,
#     m6_results$adj_r2_values, m7_results$adj_r2_values, m8_results$adj_r2_values
#   )
# )
# 
# # Step 2: Build CI summary table
# ci_summary <- data.frame(
#   category = rep(c("whole protein", "non-orthosteric", "orthosteric"), each = 3),
#   type = rep(c("abundance", "activity", "both"), times = 3),
#   mean_adj_r2 = c(
#     mean(m1_results$adj_r2_values), mean(m2_results$adj_r2_values), m0_results$adjusted_r_squared,
#     mean(m4_results$adj_r2_values), mean(m5_results$adj_r2_values), mean(m3_results$adj_r2_values),
#     mean(m6_results$adj_r2_values), mean(m7_results$adj_r2_values), mean(m8_results$adj_r2_values)
#   ),
#   ci_lower = c(
#     quantile(m1_results$adj_r2_values, 0.025), quantile(m2_results$adj_r2_values, 0.025), 
#     quantile(m0_results$adj_r2_values, 0.025),
#     quantile(m4_results$adj_r2_values, 0.025), quantile(m5_results$adj_r2_values, 0.025), quantile(m3_results$adj_r2_values, 0.025),
#     quantile(m6_results$adj_r2_values, 0.025), quantile(m7_results$adj_r2_values, 0.025), quantile(m8_results$adj_r2_values, 0.025)
#   ),
#   ci_upper = c(
#     quantile(m1_results$adj_r2_values, 0.975), quantile(m2_results$adj_r2_values, 0.975), 
#     quantile(m0_results$adj_r2_values, 0.975),
#     quantile(m4_results$adj_r2_values, 0.975), quantile(m5_results$adj_r2_values, 0.975), quantile(m3_results$adj_r2_values, 0.975),
#     quantile(m6_results$adj_r2_values, 0.975), quantile(m7_results$adj_r2_values, 0.975), quantile(m8_results$adj_r2_values, 0.975)
#   )
# )
# 
# # Factor levels for consistency
# plot_data$category <- factor(plot_data$category, levels = c("whole protein", "non-orthosteric", "orthosteric"))
# plot_data$type <- factor(plot_data$type, levels = c("abundance", "activity", "both"))
# ci_summary$category <- factor(ci_summary$category, levels = levels(plot_data$category))
# ci_summary$type <- factor(ci_summary$type, levels = levels(plot_data$type))
# 
# # Step 3: Plot using CI summary for bars
# fig3E <- ggplot(ci_summary, aes(x = type, y = mean_adj_r2, fill = type)) +
#   geom_col(width = 0.6, alpha = 0.85) +
#   geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2, color = "black") +
#   geom_text(aes(label = sprintf("%.3f", mean_adj_r2)), vjust = -1.5, size = 5, color = "black") +
#   facet_wrap(~category) +
#   scale_fill_brewer(palette = "Set2") +
#   theme_classic() +
#   theme(
#     legend.position = "none",
#     axis.line = element_line(linewidth = 0.6),
#     strip.text = element_text(size = 16, face = "bold"),
#     axis.text.x = element_text(size = 16, face = "bold", angle = 45, hjust = 1),
#     axis.text.y = element_text(size = 12),
#     axis.title.y = element_text(size = 18),
#     plot.title = element_text(size = 18, face = "bold"),
#     plot.subtitle = element_text(size = 16, face = "italic")
#   ) +
#   labs(
#     title = "GRB2-SH3", 
#     subtitle = "Linear regression against ESM1v", 
#     x = "", 
#     y = "Adjusted R²"
#   ) +
#   # Significance comparisons using full plot_data
# geom_signif(
#     data = subset(plot_data, category == "whole protein"),
#     aes(x = type, y = adj_r2_values),
#     comparisons = list(c("abundance", "both")),
#     annotations = c("p < 2.2e-16"),
#     y_position = c(0.85),
#     tip_length = 0.02,
#     step_increase = 0.05,
#     test = "wilcox.test",
#     textsize = 5
#   ) +
#   geom_signif(
#     data = subset(plot_data, category == "non-orthosteric"),
#     aes(x = type, y = adj_r2_values),
#     comparisons = list(c("abundance", "both")),
#     annotations = c("p = 1.568e-11"),
#     y_position = c(0.85),
#     tip_length = 0.02,
#     step_increase = 0.05,
#     test = "wilcox.test",
#     textsize = 5
#   ) +
#   geom_signif(
#     data = subset(plot_data, category == "orthosteric"),
#     aes(x = type, y = adj_r2_values),
#     comparisons = list(c("abundance", "both")),
#     annotations = c("p = 1.245e-13"),
#     y_position = c(0.85),
#     tip_length = 0.02,
#     step_increase = 0.05,
#     test = "wilcox.test",
#     textsize = 5
#   ) +
#   ylim(-0.1, 1)
# 
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_barplot.pdf", 
#        plot = fig3E, width = 8, height = 4, dpi = 300)
# fig3E
# sh3_out <- doubledeepms__minimum_interchain_distances_from_PDB_perres(
#   input_file = "/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/domains/2vwf.pdb",
#   chain_query = "A",
#   chain_target = "B"
# )
# 
# sh3_out
# head(sh3_final_df_ddg_2plot_all)
# merged_df <- merge(sh3_final_df_ddg_2plot_all, sh3_out, by.x="old_Pos.x", by.y = "Pos")
# nrow(merged_df) #1056
# 
# merged_df_pos <- merged_df %>% filter(b_ddg_pred >= 0)
# nrow(merged_df_pos)
# 
# merged_df_residue <- merged_df_pos %>%
#   group_by(old_Pos.x) %>%
#   reframe(
#     median_ddgb = median(b_ddg_pred, na.rm = TRUE),
#     median_residual = median(residuals, na.rm = TRUE),
#     scHAmin_ligand = first(scHAmin_ligand),
#     orthosteric = first(orthosteric),
#     Pos = first(old_Pos.x))
# 
# nrow(merged_df_residue) #55
# 
# # Fit exponential decay: y = a * exp(-b * x) + c
# exp_model_sh3_ddgb <- nls(abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand),
#                  data = merged_df_residue,
#                  start = list(a = 1, b = 0.1))
# exp_model_sh3_ddgb
# # Nonlinear regression model
# #   model: abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand)
# #    data: merged_df_residue
# #      a      b 
# # 1.5129 0.1333 
# #  residual sum-of-squares: 8.253
# # 
# # Number of iterations to convergence: 6 
# # Achieved convergence tolerance: 5.017e-06
# 
# a <- 1.5129
# b <-  0.1333 
# half_d <- log(2) / b  # ≈ 5.199904
# half_d
# 
# y_cutoff <- mean(merged_df %>% filter(orthosteric == TRUE) %>% pull(b_ddg_pred), na.rm=TRUE)
# 
# merged_df_residue$site_type <- "Non-orthosteric site"
# merged_df_residue$site_type[abs(merged_df_residue$median_ddgb) <= y_cutoff] <- "Null"
# merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"
# 
# 
# x_vals <- seq(min(merged_df_residue$scHAmin_ligand, na.rm = TRUE),
#               max(merged_df_residue$scHAmin_ligand, na.rm = TRUE), length.out = 200)
# 
# # --- Bootstrapping for confidence intervals ---
# set.seed(11)
# boot_params <- replicate(1000, {
#   samp <- merged_df_residue[sample(nrow(merged_df_residue), replace = TRUE), ]
#   fit <- try(nlsLM(abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand),
#                    data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
#   if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
# })
# boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]
# 
# boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))
# 
# fit_df_residue <- data.frame(
#   scHAmin_ligand = x_vals,
#   median_ddgb = predict(exp_model_sh3_ddgb, newdata = data.frame(scHAmin_ligand = x_vals)),
#   lower = apply(boot_preds, 1, quantile, probs = 0.025),
#   upper = apply(boot_preds, 1, quantile, probs = 0.975)
# )
# 
# # Plot
# p1 <- ggplot(merged_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb))) +
#   # Base layer: all points
#   geom_point(aes(color = site_type, alpha = 0.4), size = 2) +
#    # Fit line with confidence ribbon
#   geom_ribbon(data = fit_df_residue,
#             aes(x = scHAmin_ligand, ymin = lower, ymax = upper),
#             fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
#   geom_line(data = fit_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb)),
#           inherit.aes = FALSE, color = "black", size = 1) +
#   geom_text_repel(data = merged_df_residue %>% filter(site_type != "Null"), aes(label=Pos))+
#   
#   # Manual color palette for site type
#   scale_color_manual(values = c(
#     "Null" = "darkgreen",
#     "Non-orthosteric site" = "darkgreen",
#     "Binding site" = "orange"
#   )) +
#   # Reference lines
#   geom_vline(xintercept = 5, linetype = "dashed", color = "slategrey") +
#   theme_classic() +
#   labs(
#     title = "GRB2-SH3: allosteric decay",
#     subtitle = "370 mutations with ddGb >=0",
#     x = "Minimal distance to ligand",
#     y = "Median ddGb",
#     color = ""
#   )  + theme(legend.position = "bottom") +
#     annotate("text",  x = Inf, y = Inf,
#              hjust = 1, vjust = 1,
#            label = "y = a * exp (b * x)\na = 1.5129  \nb = -0.1333",
#            size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
# 
# p1 <- ggMarginal(
#   p1,
#   type = "density",
#   margins = "both",
#   groupColour = FALSE,
#   groupFill = FALSE,
#   size = 10,
#   colour = "grey",
#   fill = "lightgrey"
# )
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_ddgb_decay.pdf", 
#        plot = p1, width = 4, height = 4, dpi = 300)
# p1
# lm_model <- lm(log(abs(median_ddgb)) ~ scHAmin_ligand, data = merged_df_residue)
# summary(lm_model)
# Call:
# lm(formula = log(abs(median_ddgb)) ~ scHAmin_ligand, data = merged_df_residue)
# 
# Residuals:
#      Min       1Q   Median       3Q      Max 
# -2.06277 -0.49682  0.05624  0.75934  1.81772 
# 
# Coefficients:
#                Estimate Std. Error t value Pr(>|t|)    
# (Intercept)    -0.03660    0.27318  -0.134    0.894    
# scHAmin_ligand -0.11262    0.02514  -4.480 4.02e-05 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 0.8877 on 53 degrees of freedom
# Multiple R-squared:  0.2746,  Adjusted R-squared:  0.261 
# F-statistic: 20.07 on 1 and 53 DF,  p-value: 4.018e-05
# merged_df <- merge(sh3_final_df_ddg_2plot_all, sh3_out, by.x="old_Pos.x", by.y = "Pos")
# nrow(merged_df) #1056
# 
# merged_df_neg <- merged_df %>% filter(residuals <= 0)
# nrow(merged_df_neg) #681
# 
# merged_df_residue <- merged_df_neg %>%
#   group_by(Pos_ref.x) %>%
#   reframe(
#     median_ddgb = median(b_ddg_pred, na.rm = TRUE),
#     median_residual = median(residuals, na.rm = TRUE),
#     scHAmin_ligand = first(scHAmin_ligand),
#     orthosteric = first(orthosteric),
#     Pos = first(old_Pos.x))
# 
# nrow(merged_df_residue) #
# 
# # Fit exponential decay: y = a * exp(-b * x) + c
# exp_model_sh3_res <- nls(abs(median_residual) ~ a * exp(-b * scHAmin_ligand),
#                  data = merged_df_residue,
#                  start = list(a = 1, b = 0.1))
# exp_model_sh3_res
# # Nonlinear regression model
# #   model: abs(median_residual) ~ a * exp(-b * scHAmin_ligand)
# #    data: merged_df_residue
# #      a      b 
# # 8.8472 0.1564 
# #  residual sum-of-squares: 131.2
# # 
# # Number of iterations to convergence: 10 
# # Achieved convergence tolerance: 3.291e-06
# 
# a <- 8.8472
# b <-  0.1564
# half_d <- log(2) / b  # ≈ 4.431887
# half_d
# 
# y_cutoff <- mean(merged_df %>% filter(orthosteric == TRUE) %>% pull(residuals), na.rm = TRUE)
# 
# merged_df_residue$site_type <- "Non-orthosteric site"
# merged_df_residue$site_type[abs(merged_df_residue$median_residual) <= abs(y_cutoff)] <- "Null"
# merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"
# table(merged_df_residue$site_type)
# 
# x_vals <- seq(min(merged_df_residue$scHAmin_ligand, na.rm = TRUE),
#               max(merged_df_residue$scHAmin_ligand, na.rm = TRUE), length.out = 200)
# 
# # --- Bootstrapping for confidence intervals ---
# set.seed(11)
# boot_params <- replicate(1000, {
#   samp <- merged_df_residue[sample(nrow(merged_df_residue), replace = TRUE), ]
#   fit <- try(nlsLM(abs(median_residual) ~ a * exp(-b * scHAmin_ligand),
#                    data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
#   if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
# })
# boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]
# 
# boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))
# 
# fit_df_residue <- data.frame(
#   scHAmin_ligand = x_vals,
#   median_ddgb = predict(exp_model_sh3_res, newdata = data.frame(scHAmin_ligand = x_vals)),
#   lower = apply(boot_preds, 1, quantile, probs = 0.025),
#   upper = apply(boot_preds, 1, quantile, probs = 0.975)
# )
# # Plot
# p2 <- ggplot(merged_df_residue, aes(x = scHAmin_ligand, y = abs(median_residual))) +
#   # Base layer: all points
#   geom_point(aes(color = site_type, alpha = 0.4), size = 2) +
#   geom_ribbon(data = fit_df_residue,
#             aes(x = scHAmin_ligand, ymin = lower, ymax = upper),
#             fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
#   geom_line(data = fit_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb)),
#           inherit.aes = FALSE, color = "black", size = 1) +
#   geom_text_repel(data = merged_df_residue %>% filter(abs(median_residual) > 2), aes(label=Pos))+
#   
#   # Manual color palette for site type
#   scale_color_manual(values = c(
#     "Null" = "darkgreen",
#     "Non-orthosteric site" = "darkgreen",
#     "Binding site" = "orange"
#   )) +
#   # Reference lines
#   geom_vline(xintercept = 5, linetype = "dashed", color = "slategrey") +
#   theme_classic() +
#   labs(
#     title = "GRB2-SH3: allosteric decay",
#     subtitle = "681 mutations with residuals <= 0",
#     x = "Minimal distance to ligand",
#     y = "|Median ESM1v-ddGf residual|",
#     color = ""
#   )  + theme(legend.position = "bottom") +
#     annotate("text",  x = Inf, y = Inf,
#              hjust = 1, vjust = 1,
#            label = "y = a * exp (b * x)\na = 8.8472  \nb = -0.1564",
#            size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
# 
# p2 <- ggMarginal(
#   p2,
#   type = "density",
#   margins = "both",
#   groupColour = FALSE,
#   groupFill = FALSE,
#   size = 10,
#   colour = "grey",
#   fill = "lightgrey"
# )
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_residual_decay.pdf", 
#        plot = p2, width = 4, height = 4, dpi = 300)
# p2
# lm_model <- lm(log(abs(median_residual)) ~ scHAmin_ligand, data = merged_df_residue)
# summary(lm_model)
# Call:
# lm(formula = log(abs(median_residual)) ~ scHAmin_ligand, data = merged_df_residue)
# 
# Residuals:
#      Min       1Q   Median       3Q      Max 
# -1.52127 -0.44230 -0.00484  0.58176  1.09505 
# 
# Coefficients:
#                Estimate Std. Error t value Pr(>|t|)    
# (Intercept)     1.55124    0.20309   7.638 5.37e-10 ***
# scHAmin_ligand -0.09112    0.01942  -4.692 2.06e-05 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 
# Residual standard error: 0.6651 on 51 degrees of freedom
# Multiple R-squared:  0.3015,  Adjusted R-squared:  0.2878 
# F-statistic: 22.02 on 1 and 51 DF,  p-value: 2.064e-05
# nrow(merged_df) #1056
# 
# merged_df_residue <- merged_df %>%
#   group_by(Pos_ref.x) %>%
#   reframe(
#     median_ddgb = median(b_ddg_pred, na.rm = TRUE),
#     median_residual = median(residuals, na.rm = TRUE),
#     scHAmin_ligand = first(scHAmin_ligand),
#     orthosteric = first(orthosteric),
#     Pos = first(Pos_ref.x))
# 
# nrow(merged_df_residue) #56
# cor.test(merged_df_residue$median_residual, merged_df_residue$median_ddgb, method = "spearman")
# #     Spearman's rank correlation rho
# # 
# # data:  merged_df_residue$median_residual and merged_df_residue$median_ddgb
# # S = 49534, p-value = 9.813e-09
# # alternative hypothesis: true rho is not equal to 0
# # sample estimates:
# #        rho 
# # -0.6928913 
# 
# merged_df_residue$site_type <- "Non-orthosteric site"
# merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"
# 
# figS3E <- ggplot(merged_df_residue, aes(x = median_ddgb, y =  median_residual, color = site_type)) +
#   geom_point(size = 2, alpha = 0.5) +
#   labs(
#     title = "GRB2-SH3: 56 residues (all 1056 mutations)",
#     subtitle = "Spearman's rho = -0.70",
#     x = "Median ddGb",
#     y = "Median ESM1v-ddGf residual",
#     color = "") +
#   theme_classic() +
#     scale_color_manual(values = c(
#     "Non-orthosteric site" = "darkgreen",
#     "Binding site" = "orange"
#   )) + theme(legend.position = "bottom") +
#   geom_vline(xintercept = 0, linetype = "dashed", linewidth = 0.5, color = "grey") +
#   geom_hline(yintercept = 0, linetype = "dashed", linewidth = 0.5, color = "grey") 
#   #xlim(-3,2) + ylim(-5,5)
# 
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_corr.pdf", 
#        plot = figS3E, width = 4, height = 5, dpi = 300)
# figS3E